xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 7cd49bdee00a26e142bff6e7df7c3fb9209fa795)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
10 #define TYPE AIJ
11 #define TYPE_AIJ
12 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
13 #undef TYPE
14 #undef TYPE_AIJ
15 
16 static PetscErrorCode MatReset_MPIAIJ(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
19 
20   PetscFunctionBegin;
21   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
22   PetscCall(MatStashDestroy_Private(&mat->stash));
23   PetscCall(VecDestroy(&aij->diag));
24   PetscCall(MatDestroy(&aij->A));
25   PetscCall(MatDestroy(&aij->B));
26 #if defined(PETSC_USE_CTABLE)
27   PetscCall(PetscHMapIDestroy(&aij->colmap));
28 #else
29   PetscCall(PetscFree(aij->colmap));
30 #endif
31   PetscCall(PetscFree(aij->garray));
32   PetscCall(VecDestroy(&aij->lvec));
33   PetscCall(VecScatterDestroy(&aij->Mvctx));
34   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
35   PetscCall(PetscFree(aij->ld));
36   PetscFunctionReturn(PETSC_SUCCESS);
37 }
38 
39 static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
42   /* Save the nonzero states of the component matrices because they are used to determine
43     the nonzero state of mat */
44   PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;
45 
46   PetscFunctionBegin;
47   PetscCall(MatReset_MPIAIJ(mat));
48   PetscCall(MatSetUp_MPI_Hash(mat));
49   aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
54 {
55   PetscFunctionBegin;
56   PetscCall(MatReset_MPIAIJ(mat));
57 
58   PetscCall(PetscFree(mat->data));
59 
60   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
61   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
62 
63   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
64   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
71   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
73   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
74 #if defined(PETSC_HAVE_CUDA)
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
76 #endif
77 #if defined(PETSC_HAVE_HIP)
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
79 #endif
80 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
81   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
82 #endif
83   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
84 #if defined(PETSC_HAVE_ELEMENTAL)
85   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
86 #endif
87 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
88   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
89 #endif
90 #if defined(PETSC_HAVE_HYPRE)
91   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
92   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
93 #endif
94   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
95   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
96   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
97   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
98   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
99   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
100 #if defined(PETSC_HAVE_MKL_SPARSE)
101   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
102 #endif
103   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
104   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
105   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
106   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
107   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
112 {
113   Mat B;
114 
115   PetscFunctionBegin;
116   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
117   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
118   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
119   PetscCall(MatDestroy(&B));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
124 {
125   Mat B;
126 
127   PetscFunctionBegin;
128   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
129   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
130   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*MC
135    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
136 
137    This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
138    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
139    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
140    for communicators controlling multiple processes.  It is recommended that you call both of
141    the above preallocation routines for simplicity.
142 
143    Options Database Key:
144 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
145 
146   Level: beginner
147 
148   Developer Note:
149   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`. This type also
150   automatically switches over to use inodes when enough exist.
151 
152 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
153 M*/
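/*
   A minimal usage sketch for the recommendation above: call both preallocation routines so the
   same code works on any communicator size (the sizes and nonzero counts here are placeholder
   values chosen only for illustration):

     Mat      A;
     PetscInt M = 100, N = 100;

     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N));
     PetscCall(MatSetType(A, MATAIJ));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));          // used when the communicator has one process
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); // used when it has more than one
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd() ...
     PetscCall(MatDestroy(&A));
*/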
154 
155 /*MC
156    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
157 
158    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
159    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
160    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
161   for communicators controlling multiple processes.  It is recommended that you call both of
162   the above preallocation routines for simplicity.
163 
164    Options Database Key:
165 . -mat_type aijcrl - sets the matrix type to `MATMPIAIJCRL` during a call to `MatSetFromOptions()`
166 
167   Level: beginner
168 
169 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
170 M*/
171 
172 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
173 {
174   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
175 
176   PetscFunctionBegin;
177 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
178   A->boundtocpu = flg;
179 #endif
180   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
181   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
182 
183   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
184    * This may seem a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
185    * to differ from that of the parent matrix. */
186   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
187   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
192 {
193   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
194 
195   PetscFunctionBegin;
196   if (mat->A) {
197     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
198     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
199   }
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
203 static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
204 {
205   Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
206   Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
207   Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
208   const PetscInt  *ia, *ib;
209   const MatScalar *aa, *bb, *aav, *bav;
210   PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
211   PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;
212 
213   PetscFunctionBegin;
214   *keptrows = NULL;
215 
216   ia = a->i;
217   ib = b->i;
218   PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
219   PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
220   for (i = 0; i < m; i++) {
221     na = ia[i + 1] - ia[i];
222     nb = ib[i + 1] - ib[i];
223     if (!na && !nb) {
224       cnt++;
225       goto ok1;
226     }
227     aa = aav + ia[i];
228     for (j = 0; j < na; j++) {
229       if (aa[j] != 0.0) goto ok1;
230     }
231     bb = PetscSafePointerPlusOffset(bav, ib[i]);
232     for (j = 0; j < nb; j++) {
233       if (bb[j] != 0.0) goto ok1;
234     }
235     cnt++;
236   ok1:;
237   }
238   PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
239   if (!n0rows) {
240     PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
241     PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
242     PetscFunctionReturn(PETSC_SUCCESS);
243   }
244   PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
245   cnt = 0;
246   for (i = 0; i < m; i++) {
247     na = ia[i + 1] - ia[i];
248     nb = ib[i + 1] - ib[i];
249     if (!na && !nb) continue;
250     aa = aav + ia[i];
251     for (j = 0; j < na; j++) {
252       if (aa[j] != 0.0) {
253         rows[cnt++] = rstart + i;
254         goto ok2;
255       }
256     }
257     bb = PetscSafePointerPlusOffset(bav, ib[i]);
258     for (j = 0; j < nb; j++) {
259       if (bb[j] != 0.0) {
260         rows[cnt++] = rstart + i;
261         goto ok2;
262       }
263     }
264   ok2:;
265   }
266   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
267   PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
268   PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
269   PetscFunctionReturn(PETSC_SUCCESS);
270 }
271 
272 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
275   PetscBool   cong;
276 
277   PetscFunctionBegin;
278   PetscCall(MatHasCongruentLayouts(Y, &cong));
279   if (Y->assembled && cong) {
280     PetscCall(MatDiagonalSet(aij->A, D, is));
281   } else {
282     PetscCall(MatDiagonalSet_Default(Y, D, is));
283   }
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
288 {
289   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
290   PetscInt    i, rstart, nrows, *rows;
291 
292   PetscFunctionBegin;
293   *zrows = NULL;
294   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
295   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
296   for (i = 0; i < nrows; i++) rows[i] += rstart;
297   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
302 {
303   Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
304   PetscInt           i, m, n, *garray = aij->garray;
305   Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
306   Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
307   PetscReal         *work;
308   const PetscScalar *dummy;
309 
310   PetscFunctionBegin;
311   PetscCall(MatGetSize(A, &m, &n));
312   PetscCall(PetscCalloc1(n, &work));
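  /* The paired GetArrayRead/RestoreArrayRead calls below appear to be here purely for their side
     effect: they are assumed to force any device-resident values of aij->A and aij->B onto the
     host before the raw a_aij->a and b_aij->a arrays are read directly in the loops that follow. */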
313   PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
314   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
315   PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
316   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
317   if (type == NORM_2) {
318     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
319     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
320   } else if (type == NORM_1) {
321     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
322     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
323   } else if (type == NORM_INFINITY) {
324     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
325     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
326   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
327     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
328     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
329   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
330     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
331     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
332   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
333   if (type == NORM_INFINITY) {
334     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
335   } else {
336     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
337   }
338   PetscCall(PetscFree(work));
339   if (type == NORM_2) {
340     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
341   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
342     for (i = 0; i < n; i++) reductions[i] /= m;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
348 {
349   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
350   IS              sis, gis;
351   const PetscInt *isis, *igis;
352   PetscInt        n, *iis, nsis, ngis, rstart, i;
353 
354   PetscFunctionBegin;
355   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
356   PetscCall(MatFindNonzeroRows(a->B, &gis));
357   PetscCall(ISGetSize(gis, &ngis));
358   PetscCall(ISGetSize(sis, &nsis));
359   PetscCall(ISGetIndices(sis, &isis));
360   PetscCall(ISGetIndices(gis, &igis));
361 
362   PetscCall(PetscMalloc1(ngis + nsis, &iis));
363   PetscCall(PetscArraycpy(iis, igis, ngis));
364   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
365   n = ngis + nsis;
366   PetscCall(PetscSortRemoveDupsInt(&n, iis));
367   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
368   for (i = 0; i < n; i++) iis[i] += rstart;
369   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
370 
371   PetscCall(ISRestoreIndices(sis, &isis));
372   PetscCall(ISRestoreIndices(gis, &igis));
373   PetscCall(ISDestroy(&sis));
374   PetscCall(ISDestroy(&gis));
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379   Local utility routine that creates a mapping from the global column
380 number to the local number in the off-diagonal part of the local
381 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable, at
382 a slightly higher hash table cost; without it, it is not scalable (each process
383 has an order-N integer array) but is fast to access.
384 */
385 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
386 {
387   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
388   PetscInt    n   = aij->B->cmap->n, i;
389 
390   PetscFunctionBegin;
391   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
392 #if defined(PETSC_USE_CTABLE)
393   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
394   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
395 #else
396   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
397   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
398 #endif
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
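/*
   A sketch of how the colmap built above is consulted elsewhere in this file (see
   MatSetValues_MPIAIJ() and MatGetValues_MPIAIJ()); here `gcol` is a hypothetical global column
   index and `lcol` the resulting local column index in the off-diagonal block B, with a negative
   result meaning the column is not present in B:

     PetscInt lcol;
   #if defined(PETSC_USE_CTABLE)
     PetscCall(PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol));
     lcol--;
   #else
     lcol = aij->colmap[gcol] - 1;
   #endif
*/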
401 
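/*
   The two macros below insert a single (row, col, value) entry into the diagonal block (suffix
   _A_Private) or the off-diagonal block (suffix _B_Private) of the local storage. Both first
   narrow the search window with a short binary search, then scan linearly for the column; an
   existing entry is added to or overwritten depending on addv, while a genuinely new nonzero
   triggers MatSeqXAIJReallocateAIJ() (if new nonzeros are allowed) and a shift of the later
   entries in the row to make room.
*/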
402 #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
403   do { \
404     if (col <= lastcol1) low1 = 0; \
405     else high1 = nrow1; \
406     lastcol1 = col; \
407     while (high1 - low1 > 5) { \
408       t = (low1 + high1) / 2; \
409       if (rp1[t] > col) high1 = t; \
410       else low1 = t; \
411     } \
412     for (_i = low1; _i < high1; _i++) { \
413       if (rp1[_i] > col) break; \
414       if (rp1[_i] == col) { \
415         if (addv == ADD_VALUES) { \
416           ap1[_i] += value; \
417           /* Not sure whether LogFlops will slow down the code or not */ \
418           (void)PetscLogFlops(1.0); \
419         } else ap1[_i] = value; \
420         goto a_noinsert; \
421       } \
422     } \
423     if (value == 0.0 && ignorezeroentries && row != col) { \
424       low1  = 0; \
425       high1 = nrow1; \
426       goto a_noinsert; \
427     } \
428     if (nonew == 1) { \
429       low1  = 0; \
430       high1 = nrow1; \
431       goto a_noinsert; \
432     } \
433     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
434     MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
435     N = nrow1++ - 1; \
436     a->nz++; \
437     high1++; \
438     /* shift up all the later entries in this row */ \
439     PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
440     PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
441     rp1[_i] = col; \
442     ap1[_i] = value; \
443   a_noinsert:; \
444     ailen[row] = nrow1; \
445   } while (0)
446 
447 #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
448   do { \
449     if (col <= lastcol2) low2 = 0; \
450     else high2 = nrow2; \
451     lastcol2 = col; \
452     while (high2 - low2 > 5) { \
453       t = (low2 + high2) / 2; \
454       if (rp2[t] > col) high2 = t; \
455       else low2 = t; \
456     } \
457     for (_i = low2; _i < high2; _i++) { \
458       if (rp2[_i] > col) break; \
459       if (rp2[_i] == col) { \
460         if (addv == ADD_VALUES) { \
461           ap2[_i] += value; \
462           (void)PetscLogFlops(1.0); \
463         } else ap2[_i] = value; \
464         goto b_noinsert; \
465       } \
466     } \
467     if (value == 0.0 && ignorezeroentries) { \
468       low2  = 0; \
469       high2 = nrow2; \
470       goto b_noinsert; \
471     } \
472     if (nonew == 1) { \
473       low2  = 0; \
474       high2 = nrow2; \
475       goto b_noinsert; \
476     } \
477     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
478     MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
479     N = nrow2++ - 1; \
480     b->nz++; \
481     high2++; \
482     /* shift up all the later entries in this row */ \
483     PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
484     PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
485     rp2[_i] = col; \
486     ap2[_i] = value; \
487   b_noinsert:; \
488     bilen[row] = nrow2; \
489   } while (0)
490 
491 static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
492 {
493   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
494   Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
495   PetscInt     l, *garray                         = mat->garray, diag;
496   PetscScalar *aa, *ba;
497 
498   PetscFunctionBegin;
499   /* code only works for square matrices A */
500 
501   /* find size of row to the left of the diagonal part */
502   PetscCall(MatGetOwnershipRange(A, &diag, NULL));
503   row = row - diag;
504   for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
505     if (garray[b->j[b->i[row] + l]] > diag) break;
506   }
507   if (l) {
508     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
509     PetscCall(PetscArraycpy(ba + b->i[row], v, l));
510     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
511   }
512 
513   /* diagonal part */
514   if (a->i[row + 1] - a->i[row]) {
515     PetscCall(MatSeqAIJGetArray(mat->A, &aa));
516     PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
517     PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
518   }
519 
520   /* right of diagonal part */
521   if (b->i[row + 1] - b->i[row] - l) {
522     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
523     PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
524     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
525   }
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
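/*
   Caller-side usage sketch: rows and columns passed to MatSetValues() are always global indices.
   Entries whose row is owned by another process are stashed below and only communicated during
   assembly, so a typical caller looks like (the indices and values are placeholders):

     PetscInt    row = 0, cols[2] = {0, 1};
     PetscScalar vals[2] = {1.0, 2.0};

     PetscCall(MatSetValues(mat, 1, &row, 2, cols, vals, ADD_VALUES));
     PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
*/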
529 PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
530 {
531   Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
532   PetscScalar value = 0.0;
533   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
534   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
535   PetscBool   roworiented = aij->roworiented;
536 
537   /* Some variables required by the macros below */
538   Mat         A     = aij->A;
539   Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
540   PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
541   PetscBool   ignorezeroentries = a->ignorezeroentries;
542   Mat         B                 = aij->B;
543   Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
544   PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
545   MatScalar  *aa, *ba;
546   PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
547   PetscInt    nonew;
548   MatScalar  *ap1, *ap2;
549 
550   PetscFunctionBegin;
551   PetscCall(MatSeqAIJGetArray(A, &aa));
552   PetscCall(MatSeqAIJGetArray(B, &ba));
553   for (i = 0; i < m; i++) {
554     if (im[i] < 0) continue;
555     PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
556     if (im[i] >= rstart && im[i] < rend) {
557       row      = im[i] - rstart;
558       lastcol1 = -1;
559       rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
560       ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
561       rmax1    = aimax[row];
562       nrow1    = ailen[row];
563       low1     = 0;
564       high1    = nrow1;
565       lastcol2 = -1;
566       rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
567       ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
568       rmax2    = bimax[row];
569       nrow2    = bilen[row];
570       low2     = 0;
571       high2    = nrow2;
572 
573       for (j = 0; j < n; j++) {
574         if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
575         if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
576         if (in[j] >= cstart && in[j] < cend) {
577           col   = in[j] - cstart;
578           nonew = a->nonew;
579           MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
580         } else if (in[j] < 0) {
581           continue;
582         } else {
583           PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
584           if (mat->was_assembled) {
585             if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
586 #if defined(PETSC_USE_CTABLE)
587             PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
588             col--;
589 #else
590             col = aij->colmap[in[j]] - 1;
591 #endif
592             if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
593               PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
594               col = in[j];
595               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
596               B     = aij->B;
597               b     = (Mat_SeqAIJ *)B->data;
598               bimax = b->imax;
599               bi    = b->i;
600               bilen = b->ilen;
601               bj    = b->j;
602               ba    = b->a;
603               rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
604               ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
605               rmax2 = bimax[row];
606               nrow2 = bilen[row];
607               low2  = 0;
608               high2 = nrow2;
609               bm    = aij->B->rmap->n;
610               ba    = b->a;
611             } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
612               PetscCheck(1 == ((Mat_SeqAIJ *)aij->B->data)->nonew, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
613               PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
614             }
615           } else col = in[j];
616           nonew = b->nonew;
617           MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
618         }
619       }
620     } else {
621       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
622       if (!aij->donotstash) {
623         mat->assembled = PETSC_FALSE;
624         if (roworiented) {
625           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
626         } else {
627           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
628         }
629       }
630     }
631   }
632   PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, ba might have been freed due to reallocation above, but we don't access them here */
633   PetscCall(MatSeqAIJRestoreArray(B, &ba));
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636 
637 /*
638     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
639     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
640     No off-processor parts of the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
641 */
642 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
643 {
644   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
645   Mat         A      = aij->A; /* diagonal part of the matrix */
646   Mat         B      = aij->B; /* off-diagonal part of the matrix */
647   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
648   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
649   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
650   PetscInt   *ailen = a->ilen, *aj = a->j;
651   PetscInt   *bilen = b->ilen, *bj = b->j;
652   PetscInt    am          = aij->A->rmap->n, j;
653   PetscInt    diag_so_far = 0, dnz;
654   PetscInt    offd_so_far = 0, onz;
655 
656   PetscFunctionBegin;
657   /* Iterate over all rows of the matrix */
658   for (j = 0; j < am; j++) {
659     dnz = onz = 0;
660     /*  Iterate over all non-zero columns of the current row */
661     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
662       /* If column is in the diagonal */
663       if (mat_j[col] >= cstart && mat_j[col] < cend) {
664         aj[diag_so_far++] = mat_j[col] - cstart;
665         dnz++;
666       } else { /* off-diagonal entries */
667         bj[offd_so_far++] = mat_j[col];
668         onz++;
669       }
670     }
671     ailen[j] = dnz;
672     bilen[j] = onz;
673   }
674   PetscFunctionReturn(PETSC_SUCCESS);
675 }
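/*
   Worked example of the row splitting performed above (and in the routine following), assuming
   this process owns the column range [cstart, cend) = [4, 8): a local row with global columns
   {1, 4, 6, 9} yields diagonal-block columns {0, 2} (shifted by cstart, so dnz = 2) and
   off-diagonal-block columns {1, 9} kept in global numbering (onz = 2).
*/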
676 
677 /*
678     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
679     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
680     No off-processor parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ.
681     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
682     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
683 */
684 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
685 {
686   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
687   Mat          A    = aij->A; /* diagonal part of the matrix */
688   Mat          B    = aij->B; /* off-diagonal part of the matrix */
689   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
690   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
691   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
692   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
693   PetscInt    *ailen = a->ilen, *aj = a->j;
694   PetscInt    *bilen = b->ilen, *bj = b->j;
695   PetscInt     am          = aij->A->rmap->n, j;
696   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
697   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
698   PetscScalar *aa = a->a, *ba = b->a;
699 
700   PetscFunctionBegin;
701   /* Iterate over all rows of the matrix */
702   for (j = 0; j < am; j++) {
703     dnz_row = onz_row = 0;
704     rowstart_offd     = full_offd_i[j];
705     rowstart_diag     = full_diag_i[j];
706     /*  Iterate over all non-zero columns of the current row */
707     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
708       /* If column is in the diagonal */
709       if (mat_j[col] >= cstart && mat_j[col] < cend) {
710         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
711         aa[rowstart_diag + dnz_row] = mat_a[col];
712         dnz_row++;
713       } else { /* off-diagonal entries */
714         bj[rowstart_offd + onz_row] = mat_j[col];
715         ba[rowstart_offd + onz_row] = mat_a[col];
716         onz_row++;
717       }
718     }
719     ailen[j] = dnz_row;
720     bilen[j] = onz_row;
721   }
722   PetscFunctionReturn(PETSC_SUCCESS);
723 }
724 
725 static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
726 {
727   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
728   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
729   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
730 
731   PetscFunctionBegin;
732   for (i = 0; i < m; i++) {
733     if (idxm[i] < 0) continue; /* negative row */
734     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
735     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
736     row = idxm[i] - rstart;
737     for (j = 0; j < n; j++) {
738       if (idxn[j] < 0) continue; /* negative column */
739       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
740       if (idxn[j] >= cstart && idxn[j] < cend) {
741         col = idxn[j] - cstart;
742         PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
743       } else {
744         if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
745 #if defined(PETSC_USE_CTABLE)
746         PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
747         col--;
748 #else
749         col = aij->colmap[idxn[j]] - 1;
750 #endif
751         if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
752         else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
753       }
754     }
755   }
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
760 {
761   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
762   PetscInt    nstash, reallocs;
763 
764   PetscFunctionBegin;
765   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
766 
767   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
768   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
769   PetscCall(PetscInfo(mat, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
770   PetscFunctionReturn(PETSC_SUCCESS);
771 }
772 
773 PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
774 {
775   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
776   PetscMPIInt  n;
777   PetscInt     i, j, rstart, ncols, flg;
778   PetscInt    *row, *col;
779   PetscBool    all_assembled;
780   PetscScalar *val;
781 
782   /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */
783 
784   PetscFunctionBegin;
785   if (!aij->donotstash && !mat->nooffprocentries) {
786     while (1) {
787       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
788       if (!flg) break;
789 
790       for (i = 0; i < n;) {
791         /* Now identify the consecutive vals belonging to the same row */
792         for (j = i, rstart = row[j]; j < n; j++) {
793           if (row[j] != rstart) break;
794         }
795         if (j < n) ncols = j - i;
796         else ncols = n - i;
797         /* Now assemble all these values with a single function call */
798         PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
799         i = j;
800       }
801     }
802     PetscCall(MatStashScatterEnd_Private(&mat->stash));
803   }
804 #if defined(PETSC_HAVE_DEVICE)
805   if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
806   /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
807   if (mat->boundtocpu) {
808     PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
809     PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
810   }
811 #endif
812   PetscCall(MatAssemblyBegin(aij->A, mode));
813   PetscCall(MatAssemblyEnd(aij->A, mode));
814 
815   /* determine if any process has disassembled; if so, we must
816      also disassemble ourselves so that we can reassemble. */
817   /*
818      if the nonzero structure of submatrix B cannot change then we know that
819      no process disassembled, thus we can skip this step
820   */
821   if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
822     PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &all_assembled, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
823     if (mat->was_assembled && !all_assembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
824       PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
825     }
826   }
827   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
828   PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
829 #if defined(PETSC_HAVE_DEVICE)
830   if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
831 #endif
832   PetscCall(MatAssemblyBegin(aij->B, mode));
833   PetscCall(MatAssemblyEnd(aij->B, mode));
834 
835   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
836 
837   aij->rowvalues = NULL;
838 
839   PetscCall(VecDestroy(&aij->diag));
840 
841   /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
842   if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
843     PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
844     PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
845   }
846 #if defined(PETSC_HAVE_DEVICE)
847   mat->offloadmask = PETSC_OFFLOAD_BOTH;
848 #endif
849   PetscFunctionReturn(PETSC_SUCCESS);
850 }
851 
852 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
853 {
854   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
855 
856   PetscFunctionBegin;
857   PetscCall(MatZeroEntries(l->A));
858   PetscCall(MatZeroEntries(l->B));
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
863 {
864   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
865   PetscInt   *lrows;
866   PetscInt    r, len;
867   PetscBool   cong;
868 
869   PetscFunctionBegin;
870   /* get locally owned rows */
871   PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
872   PetscCall(MatHasCongruentLayouts(A, &cong));
873   /* fix right-hand side if needed */
874   if (x && b) {
875     const PetscScalar *xx;
876     PetscScalar       *bb;
877 
878     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
879     PetscCall(VecGetArrayRead(x, &xx));
880     PetscCall(VecGetArray(b, &bb));
881     for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
882     PetscCall(VecRestoreArrayRead(x, &xx));
883     PetscCall(VecRestoreArray(b, &bb));
884   }
885 
886   if (diag != 0.0 && cong) {
887     PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
888     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
889   } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
890     Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
891     Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
892     PetscInt    nnwA, nnwB;
893     PetscBool   nnzA, nnzB;
894 
895     nnwA = aijA->nonew;
896     nnwB = aijB->nonew;
897     nnzA = aijA->keepnonzeropattern;
898     nnzB = aijB->keepnonzeropattern;
899     if (!nnzA) {
900       PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
901       aijA->nonew = 0;
902     }
903     if (!nnzB) {
904       PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
905       aijB->nonew = 0;
906     }
907     /* Must zero here before the next loop */
908     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
909     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
910     for (r = 0; r < len; ++r) {
911       const PetscInt row = lrows[r] + A->rmap->rstart;
912       if (row >= A->cmap->N) continue;
913       PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
914     }
915     aijA->nonew = nnwA;
916     aijB->nonew = nnwB;
917   } else {
918     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
919     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
920   }
921   PetscCall(PetscFree(lrows));
922   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
923   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
924 
925   /* only change matrix nonzero state if pattern was allowed to be changed */
926   if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
927     PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
928     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
929   }
930   PetscFunctionReturn(PETSC_SUCCESS);
931 }
932 
933 static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
934 {
935   Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
936   PetscInt           n = A->rmap->n;
937   PetscInt           i, j, r, m, len = 0;
938   PetscInt          *lrows, *owners = A->rmap->range;
939   PetscMPIInt        p = 0;
940   PetscSFNode       *rrows;
941   PetscSF            sf;
942   const PetscScalar *xx;
943   PetscScalar       *bb, *mask, *aij_a;
944   Vec                xmask, lmask;
945   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
946   const PetscInt    *aj, *ii, *ridx;
947   PetscScalar       *aa;
948 
949   PetscFunctionBegin;
950   /* Create SF where leaves are input rows and roots are owned rows */
951   PetscCall(PetscMalloc1(n, &lrows));
952   for (r = 0; r < n; ++r) lrows[r] = -1;
953   PetscCall(PetscMalloc1(N, &rrows));
954   for (r = 0; r < N; ++r) {
955     const PetscInt idx = rows[r];
956     PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
957     if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
958       PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
959     }
960     rrows[r].rank  = p;
961     rrows[r].index = rows[r] - owners[p];
962   }
963   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
964   PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
965   /* Collect flags for rows to be zeroed */
966   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
967   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
968   PetscCall(PetscSFDestroy(&sf));
969   /* Compress and put in row numbers */
970   for (r = 0; r < n; ++r)
971     if (lrows[r] >= 0) lrows[len++] = r;
972   /* zero diagonal part of matrix */
973   PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
974   /* handle off-diagonal part of matrix */
975   PetscCall(MatCreateVecs(A, &xmask, NULL));
976   PetscCall(VecDuplicate(l->lvec, &lmask));
977   PetscCall(VecGetArray(xmask, &bb));
978   for (i = 0; i < len; i++) bb[lrows[i]] = 1;
979   PetscCall(VecRestoreArray(xmask, &bb));
980   PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
981   PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
982   PetscCall(VecDestroy(&xmask));
983   if (x && b) { /* this code is buggy when the row and column layout don't match */
984     PetscBool cong;
985 
986     PetscCall(MatHasCongruentLayouts(A, &cong));
987     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
988     PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
989     PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
990     PetscCall(VecGetArrayRead(l->lvec, &xx));
991     PetscCall(VecGetArray(b, &bb));
992   }
993   PetscCall(VecGetArray(lmask, &mask));
994   /* remove zeroed rows of off-diagonal matrix */
995   PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
996   ii = aij->i;
997   for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
998   /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
999   if (aij->compressedrow.use) {
1000     m    = aij->compressedrow.nrows;
1001     ii   = aij->compressedrow.i;
1002     ridx = aij->compressedrow.rindex;
1003     for (i = 0; i < m; i++) {
1004       n  = ii[i + 1] - ii[i];
1005       aj = aij->j + ii[i];
1006       aa = aij_a + ii[i];
1007 
1008       for (j = 0; j < n; j++) {
1009         if (PetscAbsScalar(mask[*aj])) {
1010           if (b) bb[*ridx] -= *aa * xx[*aj];
1011           *aa = 0.0;
1012         }
1013         aa++;
1014         aj++;
1015       }
1016       ridx++;
1017     }
1018   } else { /* do not use compressed row format */
1019     m = l->B->rmap->n;
1020     for (i = 0; i < m; i++) {
1021       n  = ii[i + 1] - ii[i];
1022       aj = aij->j + ii[i];
1023       aa = aij_a + ii[i];
1024       for (j = 0; j < n; j++) {
1025         if (PetscAbsScalar(mask[*aj])) {
1026           if (b) bb[i] -= *aa * xx[*aj];
1027           *aa = 0.0;
1028         }
1029         aa++;
1030         aj++;
1031       }
1032     }
1033   }
1034   if (x && b) {
1035     PetscCall(VecRestoreArray(b, &bb));
1036     PetscCall(VecRestoreArrayRead(l->lvec, &xx));
1037   }
1038   PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
1039   PetscCall(VecRestoreArray(lmask, &mask));
1040   PetscCall(VecDestroy(&lmask));
1041   PetscCall(PetscFree(lrows));
1042 
1043   /* only change matrix nonzero state if pattern was allowed to be changed */
1044   if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
1045     PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
1046     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
1047   }
1048   PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050 
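/*
   MatMult for MPIAIJ computes y = A_diag * x_local + B_offdiag * x_ghost: the scatter of the
   needed ghost values of x into a->lvec is started first, the diagonal-block product overlaps
   with that communication, and once the scatter completes the off-diagonal block is applied
   with a multadd into the same result vector.
*/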
1051 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1052 {
1053   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1054   PetscInt    nt;
1055   VecScatter  Mvctx = a->Mvctx;
1056 
1057   PetscFunctionBegin;
1058   PetscCall(VecGetLocalSize(xx, &nt));
1059   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1060   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1061   PetscUseTypeMethod(a->A, mult, xx, yy);
1062   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1063   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1064   PetscFunctionReturn(PETSC_SUCCESS);
1065 }
1066 
1067 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1068 {
1069   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1070 
1071   PetscFunctionBegin;
1072   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1073   PetscFunctionReturn(PETSC_SUCCESS);
1074 }
1075 
1076 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1077 {
1078   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1079   VecScatter  Mvctx = a->Mvctx;
1080 
1081   PetscFunctionBegin;
1082   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1083   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1084   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1085   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1086   PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088 
1089 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1090 {
1091   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1092 
1093   PetscFunctionBegin;
1094   /* do nondiagonal part */
1095   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1096   /* do local part */
1097   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1098   /* add partial results together */
1099   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1100   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1105 {
1106   MPI_Comm    comm;
1107   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1108   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1109   IS          Me, Notme;
1110   PetscInt    M, N, first, last, *notme, i;
1111   PetscBool   lf;
1112   PetscMPIInt size;
1113 
1114   PetscFunctionBegin;
1115   /* Easy test: symmetric diagonal block */
1116   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1117   PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1118   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1119   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1120   PetscCallMPI(MPI_Comm_size(comm, &size));
1121   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1122 
1123   /* Hard test: off-diagonal block. This takes a MatCreateSubMatrix. */
1124   PetscCall(MatGetSize(Amat, &M, &N));
1125   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1126   PetscCall(PetscMalloc1(N - last + first, &notme));
1127   for (i = 0; i < first; i++) notme[i] = i;
1128   for (i = last; i < M; i++) notme[i - last + first] = i;
1129   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1130   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1131   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1132   Aoff = Aoffs[0];
1133   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1134   Boff = Boffs[0];
1135   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1136   PetscCall(MatDestroyMatrices(1, &Aoffs));
1137   PetscCall(MatDestroyMatrices(1, &Boffs));
1138   PetscCall(ISDestroy(&Me));
1139   PetscCall(ISDestroy(&Notme));
1140   PetscCall(PetscFree(notme));
1141   PetscFunctionReturn(PETSC_SUCCESS);
1142 }
1143 
1144 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1145 {
1146   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1147 
1148   PetscFunctionBegin;
1149   /* do nondiagonal part */
1150   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1151   /* do local part */
1152   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1153   /* add partial results together */
1154   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1155   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 /*
1160   This only works correctly for square matrices where the subblock A->A is the
1161    diagonal block
1162 */
1163 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1164 {
1165   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1166 
1167   PetscFunctionBegin;
1168   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1169   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1170   PetscCall(MatGetDiagonal(a->A, v));
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1175 {
1176   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1177 
1178   PetscFunctionBegin;
1179   PetscCall(MatScale(a->A, aa));
1180   PetscCall(MatScale(a->B, aa));
1181   PetscFunctionReturn(PETSC_SUCCESS);
1182 }
1183 
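/*
   Layout of the binary file written by the routine below: a 4-entry integer header
   {MAT_FILE_CLASSID, M, N, total number of nonzeros}, then the per-row nonzero counts, then all
   global column indices, then all nonzero values. Indices and values are emitted row by row with
   the columns of each row in increasing global order (off-diagonal entries left of the diagonal
   block, then the diagonal block, then the remaining off-diagonal entries).
*/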
1184 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1185 {
1186   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1187   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1188   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1189   const PetscInt    *garray = aij->garray;
1190   const PetscScalar *aa, *ba;
1191   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1192   PetscInt64         nz, hnz;
1193   PetscInt          *rowlens;
1194   PetscInt          *colidxs;
1195   PetscScalar       *matvals;
1196   PetscMPIInt        rank;
1197 
1198   PetscFunctionBegin;
1199   PetscCall(PetscViewerSetUp(viewer));
1200 
1201   M  = mat->rmap->N;
1202   N  = mat->cmap->N;
1203   m  = mat->rmap->n;
1204   rs = mat->rmap->rstart;
1205   cs = mat->cmap->rstart;
1206   nz = A->nz + B->nz;
1207 
1208   /* write matrix header */
1209   header[0] = MAT_FILE_CLASSID;
1210   header[1] = M;
1211   header[2] = N;
1212   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1213   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1214   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1215   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1216 
1217   /* fill in and store row lengths  */
1218   PetscCall(PetscMalloc1(m, &rowlens));
1219   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1220   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1221   PetscCall(PetscFree(rowlens));
1222 
1223   /* fill in and store column indices */
1224   PetscCall(PetscMalloc1(nz, &colidxs));
1225   for (cnt = 0, i = 0; i < m; i++) {
1226     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1227       if (garray[B->j[jb]] > cs) break;
1228       colidxs[cnt++] = garray[B->j[jb]];
1229     }
1230     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1231     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1232   }
1233   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1234   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1235   PetscCall(PetscFree(colidxs));
1236 
1237   /* fill in and store nonzero values */
1238   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1239   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1240   PetscCall(PetscMalloc1(nz, &matvals));
1241   for (cnt = 0, i = 0; i < m; i++) {
1242     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1243       if (garray[B->j[jb]] > cs) break;
1244       matvals[cnt++] = ba[jb];
1245     }
1246     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1247     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1248   }
1249   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1250   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1251   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1252   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1253   PetscCall(PetscFree(matvals));
1254 
1255   /* write block size option to the viewer's .info file */
1256   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1257   PetscFunctionReturn(PETSC_SUCCESS);
1258 }
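
/*
   A minimal usage sketch (illustrative, not part of this file) of the binary format assembled above,
   assuming `mat` is an assembled MATMPIAIJ and "matrix.dat" is a writable path:

     PetscViewer viewer;
     PetscCall(PetscViewerBinaryOpen(PetscObjectComm((PetscObject)mat), "matrix.dat", FILE_MODE_WRITE, &viewer));
     PetscCall(MatView(mat, viewer));
     PetscCall(PetscViewerDestroy(&viewer));

   The file then holds the header (MAT_FILE_CLASSID, M, N, global nonzero count), the per-row lengths,
   the global column indices in ascending order within each row, and the matching values, exactly as
   gathered by the routine above.
*/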
1259 
1260 #include <petscdraw.h>
1261 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1262 {
1263   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1264   PetscMPIInt       rank = aij->rank, size = aij->size;
1265   PetscBool         isdraw, isascii, isbinary;
1266   PetscViewer       sviewer;
1267   PetscViewerFormat format;
1268 
1269   PetscFunctionBegin;
1270   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1271   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1272   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1273   if (isascii) {
1274     PetscCall(PetscViewerGetFormat(viewer, &format));
1275     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1276       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1277       PetscCall(PetscMalloc1(size, &nz));
1278       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1279       for (i = 0; i < size; i++) {
1280         nmax = PetscMax(nmax, nz[i]);
1281         nmin = PetscMin(nmin, nz[i]);
1282         navg += nz[i];
1283       }
1284       PetscCall(PetscFree(nz));
1285       navg = navg / size;
1286       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1287       PetscFunctionReturn(PETSC_SUCCESS);
1288     }
1289     PetscCall(PetscViewerGetFormat(viewer, &format));
1290     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1291       MatInfo   info;
1292       PetscInt *inodes = NULL;
1293 
1294       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1295       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1296       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1297       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1298       if (!inodes) {
1299         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1300                                                      info.memory));
1301       } else {
1302         PetscCall(
1303           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1304       }
1305       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1306       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1307       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1308       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1309       PetscCall(PetscViewerFlush(viewer));
1310       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1311       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1312       PetscCall(VecScatterView(aij->Mvctx, viewer));
1313       PetscFunctionReturn(PETSC_SUCCESS);
1314     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1315       PetscInt inodecount, inodelimit, *inodes;
1316       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1317       if (inodes) {
1318         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1319       } else {
1320         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1321       }
1322       PetscFunctionReturn(PETSC_SUCCESS);
1323     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1324       PetscFunctionReturn(PETSC_SUCCESS);
1325     }
1326   } else if (isbinary) {
1327     if (size == 1) {
1328       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1329       PetscCall(MatView(aij->A, viewer));
1330     } else {
1331       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1332     }
1333     PetscFunctionReturn(PETSC_SUCCESS);
1334   } else if (isascii && size == 1) {
1335     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1336     PetscCall(MatView(aij->A, viewer));
1337     PetscFunctionReturn(PETSC_SUCCESS);
1338   } else if (isdraw) {
1339     PetscDraw draw;
1340     PetscBool isnull;
1341     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1342     PetscCall(PetscDrawIsNull(draw, &isnull));
1343     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1344   }
1345 
1346   { /* assemble the entire matrix onto first processor */
1347     Mat A = NULL, Av;
1348     IS  isrow, iscol;
1349 
1350     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1351     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1352     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1353     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1354     /* The commented-out code below uses MatCreateSubMatrices() instead */
1355     /*
1356     Mat *AA, A = NULL, Av;
1357     IS  isrow,iscol;
1358 
1359     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1360     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1361     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1362     if (rank == 0) {
1363        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1364        A    = AA[0];
1365        Av   = AA[0];
1366     }
1367     PetscCall(MatDestroySubMatrices(1,&AA));
1368 */
1369     PetscCall(ISDestroy(&iscol));
1370     PetscCall(ISDestroy(&isrow));
1371     /*
1372        Every process has to participate in drawing the matrix since the graphics waits are
1373        synchronized across all processes that share the PetscDraw object
1374     */
1375     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1376     if (rank == 0) {
1377       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1378       PetscCall(MatView_SeqAIJ(Av, sviewer));
1379     }
1380     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1381     PetscCall(MatDestroy(&A));
1382   }
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
1385 
1386 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1387 {
1388   PetscBool isascii, isdraw, issocket, isbinary;
1389 
1390   PetscFunctionBegin;
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1395   if (isascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1396   PetscFunctionReturn(PETSC_SUCCESS);
1397 }
1398 
1399 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1400 {
1401   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1402   Vec         bb1 = NULL;
1403   PetscBool   hasop;
1404 
1405   PetscFunctionBegin;
1406   if (flag == SOR_APPLY_UPPER) {
1407     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1408     PetscFunctionReturn(PETSC_SUCCESS);
1409   }
1410 
1411   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1412 
1413   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1414     if (flag & SOR_ZERO_INITIAL_GUESS) {
1415       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1416       its--;
1417     }
1418 
1419     while (its--) {
1420       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1421       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1422 
1423       /* update rhs: bb1 = bb - B*x */
1424       PetscCall(VecScale(mat->lvec, -1.0));
1425       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1426 
1427       /* local sweep */
1428       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1429     }
1430   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1431     if (flag & SOR_ZERO_INITIAL_GUESS) {
1432       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1433       its--;
1434     }
1435     while (its--) {
1436       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1437       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1438 
1439       /* update rhs: bb1 = bb - B*x */
1440       PetscCall(VecScale(mat->lvec, -1.0));
1441       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1442 
1443       /* local sweep */
1444       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1445     }
1446   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1447     if (flag & SOR_ZERO_INITIAL_GUESS) {
1448       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1449       its--;
1450     }
1451     while (its--) {
1452       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1453       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1454 
1455       /* update rhs: bb1 = bb - B*x */
1456       PetscCall(VecScale(mat->lvec, -1.0));
1457       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1458 
1459       /* local sweep */
1460       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1461     }
1462   } else if (flag & SOR_EISENSTAT) {
1463     Vec xx1;
1464 
1465     PetscCall(VecDuplicate(bb, &xx1));
1466     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1467 
1468     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1469     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1470     if (!mat->diag) {
1471       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1472       PetscCall(MatGetDiagonal(matin, mat->diag));
1473     }
1474     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1475     if (hasop) {
1476       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1477     } else {
1478       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1479     }
1480     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1481 
1482     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1483 
1484     /* local sweep */
1485     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1486     PetscCall(VecAXPY(xx, 1.0, xx1));
1487     PetscCall(VecDestroy(&xx1));
1488   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1489 
1490   PetscCall(VecDestroy(&bb1));
1491 
1492   matin->factorerrortype = mat->A->factorerrortype;
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
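
/*
   A minimal usage sketch (illustrative, not part of this file): the local sweeps above are what the
   SOR preconditioner applies to a MATMPIAIJ, e.g. with a KSP named `ksp`:

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCSOR));
     PetscCall(PCSORSetSymmetric(pc, SOR_LOCAL_SYMMETRIC_SWEEP));

   Only the processor-local SOR_LOCAL_* sweeps (and the Eisenstat variant) are supported here; a true
   parallel SOR falls through to the SETERRQ() above.
*/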
1495 
1496 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1497 {
1498   Mat             aA, aB, Aperm;
1499   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1500   PetscScalar    *aa, *ba;
1501   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1502   PetscSF         rowsf, sf;
1503   IS              parcolp = NULL;
1504   PetscBool       done;
1505 
1506   PetscFunctionBegin;
1507   PetscCall(MatGetLocalSize(A, &m, &n));
1508   PetscCall(ISGetIndices(rowp, &rwant));
1509   PetscCall(ISGetIndices(colp, &cwant));
1510   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1511 
1512   /* Invert row permutation to find out where my rows should go */
1513   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1514   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1515   PetscCall(PetscSFSetFromOptions(rowsf));
1516   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1517   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1518   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1519 
1520   /* Invert column permutation to find out where my columns should go */
1521   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1522   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1523   PetscCall(PetscSFSetFromOptions(sf));
1524   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1525   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1526   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1527   PetscCall(PetscSFDestroy(&sf));
1528 
1529   PetscCall(ISRestoreIndices(rowp, &rwant));
1530   PetscCall(ISRestoreIndices(colp, &cwant));
1531   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1532 
1533   /* Find out where my gcols should go */
1534   PetscCall(MatGetSize(aB, NULL, &ng));
1535   PetscCall(PetscMalloc1(ng, &gcdest));
1536   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1537   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1538   PetscCall(PetscSFSetFromOptions(sf));
1539   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1540   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1541   PetscCall(PetscSFDestroy(&sf));
1542 
1543   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1544   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1545   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1546   for (i = 0; i < m; i++) {
1547     PetscInt    row = rdest[i];
1548     PetscMPIInt rowner;
1549     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1550     for (j = ai[i]; j < ai[i + 1]; j++) {
1551       PetscInt    col = cdest[aj[j]];
1552       PetscMPIInt cowner;
1553       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1554       if (rowner == cowner) dnnz[i]++;
1555       else onnz[i]++;
1556     }
1557     for (j = bi[i]; j < bi[i + 1]; j++) {
1558       PetscInt    col = gcdest[bj[j]];
1559       PetscMPIInt cowner;
1560       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1561       if (rowner == cowner) dnnz[i]++;
1562       else onnz[i]++;
1563     }
1564   }
1565   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1566   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1567   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1568   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1569   PetscCall(PetscSFDestroy(&rowsf));
1570 
1571   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1572   PetscCall(MatSeqAIJGetArray(aA, &aa));
1573   PetscCall(MatSeqAIJGetArray(aB, &ba));
1574   for (i = 0; i < m; i++) {
1575     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1576     PetscInt  j0, rowlen;
1577     rowlen = ai[i + 1] - ai[i];
1578     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than the number of local rows m (the length of the repurposed work arrays), so insert the values in batches of at most m */
1579       for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
1580       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1581     }
1582     rowlen = bi[i + 1] - bi[i];
1583     for (j0 = j = 0; j < rowlen; j0 = j) {
1584       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1585       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1586     }
1587   }
1588   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1589   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1590   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1591   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1592   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1593   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1594   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1595   PetscCall(PetscFree3(work, rdest, cdest));
1596   PetscCall(PetscFree(gcdest));
1597   if (parcolp) PetscCall(ISDestroy(&colp));
1598   *B = Aperm;
1599   PetscFunctionReturn(PETSC_SUCCESS);
1600 }
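
/*
   A minimal usage sketch (illustrative, not part of this file): permuting an assembled MATMPIAIJ `A`
   with distributed index sets; the identity permutation below is just for demonstration.

     PetscInt m, n, rstart, cstart;
     IS       rowp, colp;
     Mat      Aperm;
     PetscCall(MatGetLocalSize(A, &m, &n));
     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
     PetscCall(MatGetOwnershipRangeColumn(A, &cstart, NULL));
     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)A), m, rstart, 1, &rowp));
     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)A), n, cstart, 1, &colp));
     PetscCall(MatPermute(A, rowp, colp, &Aperm));
     PetscCall(ISDestroy(&rowp));
     PetscCall(ISDestroy(&colp));
*/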
1601 
1602 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1603 {
1604   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1605 
1606   PetscFunctionBegin;
1607   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1608   if (ghosts) *ghosts = aij->garray;
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
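
/*
   A minimal usage sketch (illustrative, not part of this file): the "ghosts" returned above are the
   global column indices of the compressed off-diagonal block, i.e. aij->garray.

     PetscInt        nghost;
     const PetscInt *ghosts;
     PetscCall(MatGetGhosts(mat, &nghost, &ghosts));
*/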
1611 
1612 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1613 {
1614   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1615   Mat            A = mat->A, B = mat->B;
1616   PetscLogDouble isend[5], irecv[5];
1617 
1618   PetscFunctionBegin;
1619   info->block_size = 1.0;
1620   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1621 
1622   isend[0] = info->nz_used;
1623   isend[1] = info->nz_allocated;
1624   isend[2] = info->nz_unneeded;
1625   isend[3] = info->memory;
1626   isend[4] = info->mallocs;
1627 
1628   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1629 
1630   isend[0] += info->nz_used;
1631   isend[1] += info->nz_allocated;
1632   isend[2] += info->nz_unneeded;
1633   isend[3] += info->memory;
1634   isend[4] += info->mallocs;
1635   if (flag == MAT_LOCAL) {
1636     info->nz_used      = isend[0];
1637     info->nz_allocated = isend[1];
1638     info->nz_unneeded  = isend[2];
1639     info->memory       = isend[3];
1640     info->mallocs      = isend[4];
1641   } else if (flag == MAT_GLOBAL_MAX) {
1642     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1643 
1644     info->nz_used      = irecv[0];
1645     info->nz_allocated = irecv[1];
1646     info->nz_unneeded  = irecv[2];
1647     info->memory       = irecv[3];
1648     info->mallocs      = irecv[4];
1649   } else if (flag == MAT_GLOBAL_SUM) {
1650     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1651 
1652     info->nz_used      = irecv[0];
1653     info->nz_allocated = irecv[1];
1654     info->nz_unneeded  = irecv[2];
1655     info->memory       = irecv[3];
1656     info->mallocs      = irecv[4];
1657   }
1658   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1659   info->fill_ratio_needed = 0;
1660   info->factor_mallocs    = 0;
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
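
/*
   A minimal usage sketch (illustrative, not part of this file): querying the statistics gathered above;
   MAT_GLOBAL_SUM and MAT_GLOBAL_MAX reduce the per-rank numbers over the matrix communicator.

     MatInfo info;
     PetscCall(MatGetInfo(mat, MAT_GLOBAL_SUM, &info));
     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "nz used %g allocated %g\n", info.nz_used, info.nz_allocated));
*/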
1663 
1664 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1665 {
1666   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1667 
1668   PetscFunctionBegin;
1669   switch (op) {
1670   case MAT_NEW_NONZERO_LOCATIONS:
1671   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1672   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1673   case MAT_KEEP_NONZERO_PATTERN:
1674   case MAT_NEW_NONZERO_LOCATION_ERR:
1675   case MAT_USE_INODES:
1676   case MAT_IGNORE_ZERO_ENTRIES:
1677   case MAT_FORM_EXPLICIT_TRANSPOSE:
1678     MatCheckPreallocated(A, 1);
1679     PetscCall(MatSetOption(a->A, op, flg));
1680     PetscCall(MatSetOption(a->B, op, flg));
1681     break;
1682   case MAT_ROW_ORIENTED:
1683     MatCheckPreallocated(A, 1);
1684     a->roworiented = flg;
1685 
1686     PetscCall(MatSetOption(a->A, op, flg));
1687     PetscCall(MatSetOption(a->B, op, flg));
1688     break;
1689   case MAT_IGNORE_OFF_PROC_ENTRIES:
1690     a->donotstash = flg;
1691     break;
1692   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1693   case MAT_SPD:
1694   case MAT_SYMMETRIC:
1695   case MAT_STRUCTURALLY_SYMMETRIC:
1696   case MAT_HERMITIAN:
1697   case MAT_SYMMETRY_ETERNAL:
1698   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1699   case MAT_SPD_ETERNAL:
1700     /* if the local diagonal block is square it inherits some of the properties above */
1701     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
1702     break;
1703   case MAT_SUBMAT_SINGLEIS:
1704     A->submat_singleis = flg;
1705     break;
1706   default:
1707     break;
1708   }
1709   PetscFunctionReturn(PETSC_SUCCESS);
1710 }
1711 
1712 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1713 {
1714   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1715   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1716   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1717   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1718   PetscInt    *cmap, *idx_p;
1719 
1720   PetscFunctionBegin;
1721   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1722   mat->getrowactive = PETSC_TRUE;
1723 
1724   if (!mat->rowvalues && (idx || v)) {
1725     /*
1726         allocate enough space to hold information from the longest row.
1727     */
1728     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1729     PetscInt    max = 1, tmp;
1730     for (i = 0; i < matin->rmap->n; i++) {
1731       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1732       if (max < tmp) max = tmp;
1733     }
1734     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1735   }
1736 
1737   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1738   lrow = row - rstart;
1739 
1740   pvA = &vworkA;
1741   pcA = &cworkA;
1742   pvB = &vworkB;
1743   pcB = &cworkB;
1744   if (!v) {
1745     pvA = NULL;
1746     pvB = NULL;
1747   }
1748   if (!idx) {
1749     pcA = NULL;
1750     if (!v) pcB = NULL;
1751   }
1752   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1753   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1754   nztot = nzA + nzB;
1755 
1756   cmap = mat->garray;
1757   if (v || idx) {
1758     if (nztot) {
1759       /* Sort by increasing column numbers, assuming A and B already sorted */
1760       PetscInt imark = -1;
1761       if (v) {
1762         *v = v_p = mat->rowvalues;
1763         for (i = 0; i < nzB; i++) {
1764           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1765           else break;
1766         }
1767         imark = i;
1768         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1769         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1770       }
1771       if (idx) {
1772         *idx = idx_p = mat->rowindices;
1773         if (imark > -1) {
1774           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1775         } else {
1776           for (i = 0; i < nzB; i++) {
1777             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1778             else break;
1779           }
1780           imark = i;
1781         }
1782         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1783         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1784       }
1785     } else {
1786       if (idx) *idx = NULL;
1787       if (v) *v = NULL;
1788     }
1789   }
1790   *nz = nztot;
1791   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1792   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1793   PetscFunctionReturn(PETSC_SUCCESS);
1794 }
1795 
1796 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1797 {
1798   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1799 
1800   PetscFunctionBegin;
1801   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1802   aij->getrowactive = PETSC_FALSE;
1803   PetscFunctionReturn(PETSC_SUCCESS);
1804 }
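
/*
   A minimal usage sketch (illustrative, not part of this file): MatGetRow()/MatRestoreRow() on a locally
   owned row; the returned column indices are global and sorted, merged from the diagonal and
   off-diagonal parts as done above.

     PetscInt           ncols, rstart;
     const PetscInt    *cols;
     const PetscScalar *vals;
     PetscCall(MatGetOwnershipRange(mat, &rstart, NULL));
     PetscCall(MatGetRow(mat, rstart, &ncols, &cols, &vals));
     ... use ncols, cols[], vals[] ...
     PetscCall(MatRestoreRow(mat, rstart, &ncols, &cols, &vals));
*/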
1805 
1806 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1807 {
1808   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1809   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1810   PetscInt         i, j;
1811   PetscReal        sum = 0.0;
1812   const MatScalar *v, *amata, *bmata;
1813 
1814   PetscFunctionBegin;
1815   if (aij->size == 1) {
1816     PetscCall(MatNorm(aij->A, type, norm));
1817   } else {
1818     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1819     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1820     if (type == NORM_FROBENIUS) {
1821       v = amata;
1822       for (i = 0; i < amat->nz; i++) {
1823         sum += PetscRealPart(PetscConj(*v) * (*v));
1824         v++;
1825       }
1826       v = bmata;
1827       for (i = 0; i < bmat->nz; i++) {
1828         sum += PetscRealPart(PetscConj(*v) * (*v));
1829         v++;
1830       }
1831       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1832       *norm = PetscSqrtReal(*norm);
1833       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1834     } else if (type == NORM_1) { /* max column norm */
1835       Vec          col, bcol;
1836       PetscScalar *array;
1837       PetscInt    *jj, *garray = aij->garray;
1838 
1839       PetscCall(MatCreateVecs(mat, &col, NULL));
1840       PetscCall(VecSet(col, 0.0));
1841       PetscCall(VecGetArrayWrite(col, &array));
1842       v  = amata;
1843       jj = amat->j;
1844       for (j = 0; j < amat->nz; j++) array[*jj++] += PetscAbsScalar(*v++);
1845       PetscCall(VecRestoreArrayWrite(col, &array));
1846       PetscCall(MatCreateVecs(aij->B, &bcol, NULL));
1847       PetscCall(VecSet(bcol, 0.0));
1848       PetscCall(VecGetArrayWrite(bcol, &array));
1849       v  = bmata;
1850       jj = bmat->j;
1851       for (j = 0; j < bmat->nz; j++) array[*jj++] += PetscAbsScalar(*v++);
1852       PetscCall(VecSetValues(col, aij->B->cmap->n, garray, array, ADD_VALUES));
1853       PetscCall(VecRestoreArrayWrite(bcol, &array));
1854       PetscCall(VecDestroy(&bcol));
1855       PetscCall(VecAssemblyBegin(col));
1856       PetscCall(VecAssemblyEnd(col));
1857       PetscCall(VecNorm(col, NORM_INFINITY, norm));
1858       PetscCall(VecDestroy(&col));
1859     } else if (type == NORM_INFINITY) { /* max row norm */
1860       PetscReal ntemp = 0.0;
1861       for (j = 0; j < aij->A->rmap->n; j++) {
1862         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1863         sum = 0.0;
1864         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1865           sum += PetscAbsScalar(*v);
1866           v++;
1867         }
1868         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1869         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1870           sum += PetscAbsScalar(*v);
1871           v++;
1872         }
1873         if (sum > ntemp) ntemp = sum;
1874       }
1875       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1876       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1877     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1878     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1879     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1880   }
1881   PetscFunctionReturn(PETSC_SUCCESS);
1882 }
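
/*
   A minimal usage sketch (illustrative, not part of this file): the three norms handled above.

     PetscReal nrm;
     PetscCall(MatNorm(mat, NORM_FROBENIUS, &nrm));   square root of the global sum of |a_ij|^2
     PetscCall(MatNorm(mat, NORM_1, &nrm));           largest column sum of |a_ij|
     PetscCall(MatNorm(mat, NORM_INFINITY, &nrm));    largest row sum of |a_ij|

   NORM_2 is not implemented for MATMPIAIJ and triggers the SETERRQ() above.
*/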
1883 
1884 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1885 {
1886   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1887   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1888   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1889   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1890   Mat              B, A_diag, *B_diag;
1891   const MatScalar *pbv, *bv;
1892 
1893   PetscFunctionBegin;
1894   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1895   ma = A->rmap->n;
1896   na = A->cmap->n;
1897   mb = a->B->rmap->n;
1898   nb = a->B->cmap->n;
1899   ai = Aloc->i;
1900   aj = Aloc->j;
1901   bi = Bloc->i;
1902   bj = Bloc->j;
1903   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1904     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1905     PetscSFNode         *oloc;
1906     PETSC_UNUSED PetscSF sf;
1907 
1908     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1909     /* compute d_nnz for preallocation */
1910     PetscCall(PetscArrayzero(d_nnz, na));
1911     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1912     /* compute local off-diagonal contributions */
1913     PetscCall(PetscArrayzero(g_nnz, nb));
1914     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1915     /* map those to global */
1916     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1917     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1918     PetscCall(PetscSFSetFromOptions(sf));
1919     PetscCall(PetscArrayzero(o_nnz, na));
1920     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1921     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1922     PetscCall(PetscSFDestroy(&sf));
1923 
1924     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1925     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1926     PetscCall(MatSetBlockSizes(B, A->cmap->bs, A->rmap->bs));
1927     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1928     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1929     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1930   } else {
1931     B = *matout;
1932     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1933   }
1934 
1935   b           = (Mat_MPIAIJ *)B->data;
1936   A_diag      = a->A;
1937   B_diag      = &b->A;
1938   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1939   A_diag_ncol = A_diag->cmap->N;
1940   B_diag_ilen = sub_B_diag->ilen;
1941   B_diag_i    = sub_B_diag->i;
1942 
1943   /* Set ilen for diagonal of B */
1944   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1945 
1946   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1947   very quickly (without using MatSetValues()) because all writes are local. */
1948   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1949   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1950 
1951   /* copy over the B part */
1952   PetscCall(PetscMalloc1(bi[mb], &cols));
1953   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1954   pbv = bv;
1955   row = A->rmap->rstart;
1956   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1957   cols_tmp = cols;
1958   for (i = 0; i < mb; i++) {
1959     ncol = bi[i + 1] - bi[i];
1960     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1961     row++;
1962     if (pbv) pbv += ncol;
1963     if (cols_tmp) cols_tmp += ncol;
1964   }
1965   PetscCall(PetscFree(cols));
1966   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1967 
1968   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1969   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1970   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1971     *matout = B;
1972   } else {
1973     PetscCall(MatHeaderMerge(A, &B));
1974   }
1975   PetscFunctionReturn(PETSC_SUCCESS);
1976 }
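
/*
   A minimal usage sketch (illustrative, not part of this file): the out-of-place, reuse, and in-place
   cases handled above.

     Mat At;
     PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));   new matrix, preallocated as computed above
     PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &At));     refill At obtained from an earlier MAT_INITIAL_MATRIX call
     PetscCall(MatTranspose(A, MAT_INPLACE_MATRIX, &A));    A is replaced by its transpose via MatHeaderMerge()
*/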
1977 
1978 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1979 {
1980   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1981   Mat         a = aij->A, b = aij->B;
1982   PetscInt    s1, s2, s3;
1983 
1984   PetscFunctionBegin;
1985   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1986   if (rr) {
1987     PetscCall(VecGetLocalSize(rr, &s1));
1988     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1989     /* Overlap communication with computation. */
1990     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1991   }
1992   if (ll) {
1993     PetscCall(VecGetLocalSize(ll, &s1));
1994     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1995     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1996   }
1997   /* scale the diagonal block */
1998   PetscUseTypeMethod(a, diagonalscale, ll, rr);
1999 
2000   if (rr) {
2001     /* Do a scatter end and then right scale the off-diagonal block */
2002     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2003     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
2004   }
2005   PetscFunctionReturn(PETSC_SUCCESS);
2006 }
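
/*
   A minimal usage sketch (illustrative, not part of this file): mat <- diag(l) * mat * diag(r); either
   vector may be NULL, and the right scaling of the off-diagonal block overlaps with the scatter above.

     Vec l, r;
     PetscCall(MatCreateVecs(mat, &r, &l));
     PetscCall(VecSet(l, 2.0));
     PetscCall(VecSet(r, 0.5));
     PetscCall(MatDiagonalScale(mat, l, r));
*/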
2007 
2008 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2009 {
2010   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2011 
2012   PetscFunctionBegin;
2013   PetscCall(MatSetUnfactored(a->A));
2014   PetscFunctionReturn(PETSC_SUCCESS);
2015 }
2016 
2017 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2018 {
2019   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2020   Mat         a, b, c, d;
2021   PetscBool   flg;
2022 
2023   PetscFunctionBegin;
2024   a = matA->A;
2025   b = matA->B;
2026   c = matB->A;
2027   d = matB->B;
2028 
2029   PetscCall(MatEqual(a, c, &flg));
2030   if (flg) PetscCall(MatEqual(b, d, &flg));
2031   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2032   PetscFunctionReturn(PETSC_SUCCESS);
2033 }
2034 
2035 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2036 {
2037   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2038   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2039 
2040   PetscFunctionBegin;
2041   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2042   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2043     /* Because of the column compression in the off-process part of the matrix a->B,
2044        the number of columns in a->B and b->B may differ, hence we cannot call
2045        MatCopy() directly on the two parts. If need be, a copy more efficient than
2046        MatCopy_Basic() could be provided by first uncompressing the a->B matrices
2047        and then copying the submatrices. */
2048     PetscCall(MatCopy_Basic(A, B, str));
2049   } else {
2050     PetscCall(MatCopy(a->A, b->A, str));
2051     PetscCall(MatCopy(a->B, b->B, str));
2052   }
2053   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2054   PetscFunctionReturn(PETSC_SUCCESS);
2055 }
2056 
2057 /*
2058    Computes the number of nonzeros per row needed for preallocation when X and Y
2059    have different nonzero structure.
2060 */
2061 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2062 {
2063   PetscInt i, j, k, nzx, nzy;
2064 
2065   PetscFunctionBegin;
2066   /* Set the number of nonzeros in the new matrix */
2067   for (i = 0; i < m; i++) {
2068     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2069     nzx    = xi[i + 1] - xi[i];
2070     nzy    = yi[i + 1] - yi[i];
2071     nnz[i] = 0;
2072     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2073       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2074       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2075       nnz[i]++;
2076     }
2077     for (; k < nzy; k++) nnz[i]++;
2078   }
2079   PetscFunctionReturn(PETSC_SUCCESS);
2080 }
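
/*
   A hedged worked example (illustrative, not part of this file) of the merge count above: for one row
   with X columns {0, 3, 7} and Y columns {3, 5} (in global numbering via xltog/yltog), the loop counts
   0 (X only), 3 (shared, counted once), 5 (Y only), and 7 (X only), giving nnz[i] = 4.
*/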
2081 
2082 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2083 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2084 {
2085   PetscInt    m = Y->rmap->N;
2086   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2087   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2088 
2089   PetscFunctionBegin;
2090   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2091   PetscFunctionReturn(PETSC_SUCCESS);
2092 }
2093 
2094 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2095 {
2096   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2097 
2098   PetscFunctionBegin;
2099   if (str == SAME_NONZERO_PATTERN) {
2100     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2101     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2102   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2103     PetscCall(MatAXPY_Basic(Y, a, X, str));
2104   } else {
2105     Mat       B;
2106     PetscInt *nnz_d, *nnz_o;
2107 
2108     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2109     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2110     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2111     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2112     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2113     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2114     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2115     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2116     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2117     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2118     PetscCall(MatHeaderMerge(Y, &B));
2119     PetscCall(PetscFree(nnz_d));
2120     PetscCall(PetscFree(nnz_o));
2121   }
2122   PetscFunctionReturn(PETSC_SUCCESS);
2123 }
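
/*
   A minimal usage sketch (illustrative, not part of this file): Y <- a*X + Y; the structure flag selects
   the path above.

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));        per-block AXPY, no new preallocation
     PetscCall(MatAXPY(Y, 2.0, X, SUBSET_NONZERO_PATTERN));      MatAXPY_Basic() path
     PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN));   Y is rebuilt with the merged preallocation
*/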
2124 
2125 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2126 
2127 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2128 {
2129   PetscFunctionBegin;
2130   if (PetscDefined(USE_COMPLEX)) {
2131     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2132 
2133     PetscCall(MatConjugate_SeqAIJ(aij->A));
2134     PetscCall(MatConjugate_SeqAIJ(aij->B));
2135   }
2136   PetscFunctionReturn(PETSC_SUCCESS);
2137 }
2138 
2139 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2140 {
2141   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2142 
2143   PetscFunctionBegin;
2144   PetscCall(MatRealPart(a->A));
2145   PetscCall(MatRealPart(a->B));
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 
2149 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2150 {
2151   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2152 
2153   PetscFunctionBegin;
2154   PetscCall(MatImaginaryPart(a->A));
2155   PetscCall(MatImaginaryPart(a->B));
2156   PetscFunctionReturn(PETSC_SUCCESS);
2157 }
2158 
2159 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2160 {
2161   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2162   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2163   PetscScalar       *vv;
2164   Vec                vB, vA;
2165   const PetscScalar *va, *vb;
2166 
2167   PetscFunctionBegin;
2168   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2169   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2170 
2171   PetscCall(VecGetArrayRead(vA, &va));
2172   if (idx) {
2173     for (i = 0; i < m; i++) {
2174       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2175     }
2176   }
2177 
2178   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2179   PetscCall(PetscMalloc1(m, &idxb));
2180   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2181 
2182   PetscCall(VecGetArrayWrite(v, &vv));
2183   PetscCall(VecGetArrayRead(vB, &vb));
2184   for (i = 0; i < m; i++) {
2185     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2186       vv[i] = vb[i];
2187       if (idx) idx[i] = a->garray[idxb[i]];
2188     } else {
2189       vv[i] = va[i];
2190       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2191     }
2192   }
2193   PetscCall(VecRestoreArrayWrite(v, &vv));
2194   PetscCall(VecRestoreArrayRead(vA, &va));
2195   PetscCall(VecRestoreArrayRead(vB, &vb));
2196   PetscCall(PetscFree(idxb));
2197   PetscCall(VecDestroy(&vA));
2198   PetscCall(VecDestroy(&vB));
2199   PetscFunctionReturn(PETSC_SUCCESS);
2200 }
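
/*
   A minimal usage sketch (illustrative, not part of this file) for this family of row reductions
   (MatGetRowMaxAbs(), MatGetRowMin(), MatGetRowMax(), ...): v receives one value per local row and
   idx, if provided, the global column index where it was attained.

     Vec       v;
     PetscInt  m, *idx;
     PetscCall(MatGetLocalSize(A, &m, NULL));
     PetscCall(MatCreateVecs(A, NULL, &v));
     PetscCall(PetscMalloc1(m, &idx));
     PetscCall(MatGetRowMaxAbs(A, v, idx));
*/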
2201 
2202 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2203 {
2204   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2205   Vec         vB, vA;
2206 
2207   PetscFunctionBegin;
2208   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2209   PetscCall(MatGetRowSumAbs(a->A, vA));
2210   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2211   PetscCall(MatGetRowSumAbs(a->B, vB));
2212   PetscCall(VecAXPY(vA, 1.0, vB));
2213   PetscCall(VecDestroy(&vB));
2214   PetscCall(VecCopy(vA, v));
2215   PetscCall(VecDestroy(&vA));
2216   PetscFunctionReturn(PETSC_SUCCESS);
2217 }
2218 
2219 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2220 {
2221   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2222   PetscInt           m = A->rmap->n, n = A->cmap->n;
2223   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2224   PetscInt          *cmap = mat->garray;
2225   PetscInt          *diagIdx, *offdiagIdx;
2226   Vec                diagV, offdiagV;
2227   PetscScalar       *a, *diagA, *offdiagA;
2228   const PetscScalar *ba, *bav;
2229   PetscInt           r, j, col, ncols, *bi, *bj;
2230   Mat                B = mat->B;
2231   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2232 
2233   PetscFunctionBegin;
2234   /* When a process holds the entire matrix A and the other processes have no entries */
2235   if (A->cmap->N == n) {
2236     PetscCall(VecGetArrayWrite(v, &diagA));
2237     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2238     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2239     PetscCall(VecDestroy(&diagV));
2240     PetscCall(VecRestoreArrayWrite(v, &diagA));
2241     PetscFunctionReturn(PETSC_SUCCESS);
2242   } else if (n == 0) {
2243     if (m) {
2244       PetscCall(VecGetArrayWrite(v, &a));
2245       for (r = 0; r < m; r++) {
2246         a[r] = 0.0;
2247         if (idx) idx[r] = -1;
2248       }
2249       PetscCall(VecRestoreArrayWrite(v, &a));
2250     }
2251     PetscFunctionReturn(PETSC_SUCCESS);
2252   }
2253 
2254   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2255   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2256   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2257   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2258 
2259   /* Get offdiagIdx[] for implicit 0.0 */
2260   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2261   ba = bav;
2262   bi = b->i;
2263   bj = b->j;
2264   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2265   for (r = 0; r < m; r++) {
2266     ncols = bi[r + 1] - bi[r];
2267     if (ncols == A->cmap->N - n) { /* Brow is dense */
2268       offdiagA[r]   = *ba;
2269       offdiagIdx[r] = cmap[0];
2270     } else { /* Brow is sparse so it has an implicit 0.0, hence we already KNOW the minimum absolute value in this row of B is 0.0 */
2271       offdiagA[r] = 0.0;
2272 
2273       /* Find first hole in the cmap */
2274       for (j = 0; j < ncols; j++) {
2275         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2276         if (col > j && j < cstart) {
2277           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2278           break;
2279         } else if (col > j + n && j >= cstart) {
2280           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2281           break;
2282         }
2283       }
2284       if (j == ncols && ncols < A->cmap->N - n) {
2285         /* a hole is outside compressed Bcols */
2286         if (ncols == 0) {
2287           if (cstart) {
2288             offdiagIdx[r] = 0;
2289           } else offdiagIdx[r] = cend;
2290         } else { /* ncols > 0 */
2291           offdiagIdx[r] = cmap[ncols - 1] + 1;
2292           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2293         }
2294       }
2295     }
2296 
2297     for (j = 0; j < ncols; j++) {
2298       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2299         offdiagA[r]   = *ba;
2300         offdiagIdx[r] = cmap[*bj];
2301       }
2302       ba++;
2303       bj++;
2304     }
2305   }
2306 
2307   PetscCall(VecGetArrayWrite(v, &a));
2308   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2309   for (r = 0; r < m; ++r) {
2310     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2311       a[r] = diagA[r];
2312       if (idx) idx[r] = cstart + diagIdx[r];
2313     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2314       a[r] = diagA[r];
2315       if (idx) {
2316         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2317           idx[r] = cstart + diagIdx[r];
2318         } else idx[r] = offdiagIdx[r];
2319       }
2320     } else {
2321       a[r] = offdiagA[r];
2322       if (idx) idx[r] = offdiagIdx[r];
2323     }
2324   }
2325   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2326   PetscCall(VecRestoreArrayWrite(v, &a));
2327   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2328   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2329   PetscCall(VecDestroy(&diagV));
2330   PetscCall(VecDestroy(&offdiagV));
2331   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2332   PetscFunctionReturn(PETSC_SUCCESS);
2333 }
2334 
2335 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2336 {
2337   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2338   PetscInt           m = A->rmap->n, n = A->cmap->n;
2339   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2340   PetscInt          *cmap = mat->garray;
2341   PetscInt          *diagIdx, *offdiagIdx;
2342   Vec                diagV, offdiagV;
2343   PetscScalar       *a, *diagA, *offdiagA;
2344   const PetscScalar *ba, *bav;
2345   PetscInt           r, j, col, ncols, *bi, *bj;
2346   Mat                B = mat->B;
2347   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2348 
2349   PetscFunctionBegin;
2350   /* When a process holds the entire matrix A and the other processes have no entries */
2351   if (A->cmap->N == n) {
2352     PetscCall(VecGetArrayWrite(v, &diagA));
2353     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2354     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2355     PetscCall(VecDestroy(&diagV));
2356     PetscCall(VecRestoreArrayWrite(v, &diagA));
2357     PetscFunctionReturn(PETSC_SUCCESS);
2358   } else if (n == 0) {
2359     if (m) {
2360       PetscCall(VecGetArrayWrite(v, &a));
2361       for (r = 0; r < m; r++) {
2362         a[r] = PETSC_MAX_REAL;
2363         if (idx) idx[r] = -1;
2364       }
2365       PetscCall(VecRestoreArrayWrite(v, &a));
2366     }
2367     PetscFunctionReturn(PETSC_SUCCESS);
2368   }
2369 
2370   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2371   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2372   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2373   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2374 
2375   /* Get offdiagIdx[] for implicit 0.0 */
2376   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2377   ba = bav;
2378   bi = b->i;
2379   bj = b->j;
2380   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2381   for (r = 0; r < m; r++) {
2382     ncols = bi[r + 1] - bi[r];
2383     if (ncols == A->cmap->N - n) { /* Brow is dense */
2384       offdiagA[r]   = *ba;
2385       offdiagIdx[r] = cmap[0];
2386     } else { /* Brow is sparse so it has an implicit 0.0, hence we already KNOW the minimum in this row of B is 0.0 or lower */
2387       offdiagA[r] = 0.0;
2388 
2389       /* Find first hole in the cmap */
2390       for (j = 0; j < ncols; j++) {
2391         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2392         if (col > j && j < cstart) {
2393           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2394           break;
2395         } else if (col > j + n && j >= cstart) {
2396           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2397           break;
2398         }
2399       }
2400       if (j == ncols && ncols < A->cmap->N - n) {
2401         /* a hole is outside compressed Bcols */
2402         if (ncols == 0) {
2403           if (cstart) {
2404             offdiagIdx[r] = 0;
2405           } else offdiagIdx[r] = cend;
2406         } else { /* ncols > 0 */
2407           offdiagIdx[r] = cmap[ncols - 1] + 1;
2408           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2409         }
2410       }
2411     }
2412 
2413     for (j = 0; j < ncols; j++) {
2414       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2415         offdiagA[r]   = *ba;
2416         offdiagIdx[r] = cmap[*bj];
2417       }
2418       ba++;
2419       bj++;
2420     }
2421   }
2422 
2423   PetscCall(VecGetArrayWrite(v, &a));
2424   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2425   for (r = 0; r < m; ++r) {
2426     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2427       a[r] = diagA[r];
2428       if (idx) idx[r] = cstart + diagIdx[r];
2429     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2430       a[r] = diagA[r];
2431       if (idx) {
2432         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2433           idx[r] = cstart + diagIdx[r];
2434         } else idx[r] = offdiagIdx[r];
2435       }
2436     } else {
2437       a[r] = offdiagA[r];
2438       if (idx) idx[r] = offdiagIdx[r];
2439     }
2440   }
2441   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2442   PetscCall(VecRestoreArrayWrite(v, &a));
2443   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2444   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2445   PetscCall(VecDestroy(&diagV));
2446   PetscCall(VecDestroy(&offdiagV));
2447   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2448   PetscFunctionReturn(PETSC_SUCCESS);
2449 }
2450 
2451 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2452 {
2453   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2454   PetscInt           m = A->rmap->n, n = A->cmap->n;
2455   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2456   PetscInt          *cmap = mat->garray;
2457   PetscInt          *diagIdx, *offdiagIdx;
2458   Vec                diagV, offdiagV;
2459   PetscScalar       *a, *diagA, *offdiagA;
2460   const PetscScalar *ba, *bav;
2461   PetscInt           r, j, col, ncols, *bi, *bj;
2462   Mat                B = mat->B;
2463   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2464 
2465   PetscFunctionBegin;
2466   /* When a process holds the entire matrix A and the other processes have no entries */
2467   if (A->cmap->N == n) {
2468     PetscCall(VecGetArrayWrite(v, &diagA));
2469     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2470     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2471     PetscCall(VecDestroy(&diagV));
2472     PetscCall(VecRestoreArrayWrite(v, &diagA));
2473     PetscFunctionReturn(PETSC_SUCCESS);
2474   } else if (n == 0) {
2475     if (m) {
2476       PetscCall(VecGetArrayWrite(v, &a));
2477       for (r = 0; r < m; r++) {
2478         a[r] = PETSC_MIN_REAL;
2479         if (idx) idx[r] = -1;
2480       }
2481       PetscCall(VecRestoreArrayWrite(v, &a));
2482     }
2483     PetscFunctionReturn(PETSC_SUCCESS);
2484   }
2485 
2486   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2487   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2488   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2489   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2490 
2491   /* Get offdiagIdx[] for implicit 0.0 */
2492   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2493   ba = bav;
2494   bi = b->i;
2495   bj = b->j;
2496   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2497   for (r = 0; r < m; r++) {
2498     ncols = bi[r + 1] - bi[r];
2499     if (ncols == A->cmap->N - n) { /* Brow is dense */
2500       offdiagA[r]   = *ba;
2501       offdiagIdx[r] = cmap[0];
2502     } else { /* Brow is sparse so it has an implicit 0.0, hence we already KNOW the maximum in this row of B is 0.0 or higher */
2503       offdiagA[r] = 0.0;
2504 
2505       /* Find first hole in the cmap */
2506       for (j = 0; j < ncols; j++) {
2507         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2508         if (col > j && j < cstart) {
2509           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2510           break;
2511         } else if (col > j + n && j >= cstart) {
2512           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2513           break;
2514         }
2515       }
2516       if (j == ncols && ncols < A->cmap->N - n) {
2517         /* a hole is outside compressed Bcols */
2518         if (ncols == 0) {
2519           if (cstart) {
2520             offdiagIdx[r] = 0;
2521           } else offdiagIdx[r] = cend;
2522         } else { /* ncols > 0 */
2523           offdiagIdx[r] = cmap[ncols - 1] + 1;
2524           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2525         }
2526       }
2527     }
2528 
2529     for (j = 0; j < ncols; j++) {
2530       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2531         offdiagA[r]   = *ba;
2532         offdiagIdx[r] = cmap[*bj];
2533       }
2534       ba++;
2535       bj++;
2536     }
2537   }
2538 
2539   PetscCall(VecGetArrayWrite(v, &a));
2540   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2541   for (r = 0; r < m; ++r) {
2542     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2543       a[r] = diagA[r];
2544       if (idx) idx[r] = cstart + diagIdx[r];
2545     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2546       a[r] = diagA[r];
2547       if (idx) {
2548         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2549           idx[r] = cstart + diagIdx[r];
2550         } else idx[r] = offdiagIdx[r];
2551       }
2552     } else {
2553       a[r] = offdiagA[r];
2554       if (idx) idx[r] = offdiagIdx[r];
2555     }
2556   }
2557   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2558   PetscCall(VecRestoreArrayWrite(v, &a));
2559   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2560   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2561   PetscCall(VecDestroy(&diagV));
2562   PetscCall(VecDestroy(&offdiagV));
2563   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2564   PetscFunctionReturn(PETSC_SUCCESS);
2565 }
2566 
2567 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2568 {
2569   Mat *dummy;
2570 
2571   PetscFunctionBegin;
2572   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2573   *newmat = *dummy;
2574   PetscCall(PetscFree(dummy));
2575   PetscFunctionReturn(PETSC_SUCCESS);
2576 }
2577 
2578 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2579 {
2580   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2581 
2582   PetscFunctionBegin;
2583   PetscCall(MatInvertBlockDiagonal(a->A, values));
2584   A->factorerrortype = a->A->factorerrortype;
2585   PetscFunctionReturn(PETSC_SUCCESS);
2586 }
2587 
2588 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2589 {
2590   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2591 
2592   PetscFunctionBegin;
2593   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2594   PetscCall(MatSetRandom(aij->A, rctx));
2595   if (x->assembled) {
2596     PetscCall(MatSetRandom(aij->B, rctx));
2597   } else {
2598     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2599   }
2600   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2601   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2602   PetscFunctionReturn(PETSC_SUCCESS);
2603 }
2604 
2605 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2606 {
2607   PetscFunctionBegin;
2608   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2609   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2610   PetscFunctionReturn(PETSC_SUCCESS);
2611 }
2612 
2613 /*@
2614   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2615 
2616   Not Collective
2617 
2618   Input Parameter:
2619 . A - the matrix
2620 
2621   Output Parameter:
2622 . nz - the number of nonzeros
2623 
2624   Level: advanced
2625 
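  Example Usage:
  A minimal sketch (assuming `A` is an assembled `MATMPIAIJ`):
.vb
  PetscCount nz;

  PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
.ve
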
2626 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2627 @*/
2628 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2629 {
2630   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2631   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2632   PetscBool   isaij;
2633 
2634   PetscFunctionBegin;
2635   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2636   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2637   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2638   PetscFunctionReturn(PETSC_SUCCESS);
2639 }
2640 
2641 /*@
2642   MatMPIAIJSetUseScalableIncreaseOverlap - Determine if the matrix uses a scalable algorithm to compute the overlap
2643 
2644   Collective
2645 
2646   Input Parameters:
2647 + A  - the matrix
2648 - sc - `PETSC_TRUE` indicates the scalable algorithm should be used (the default is not to use it)
2649 
2650   Level: advanced
2651 
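  Example Usage:
  A minimal sketch (equivalent to the -mat_increase_overlap_scalable option handled in MatSetFromOptions_MPIAIJ()):
.vb
  PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));
.ve
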
2652 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2653 @*/
2654 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2655 {
2656   PetscFunctionBegin;
2657   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2658   PetscFunctionReturn(PETSC_SUCCESS);
2659 }
2660 
2661 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems PetscOptionsObject)
2662 {
2663   PetscBool sc = PETSC_FALSE, flg;
2664 
2665   PetscFunctionBegin;
2666   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2667   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2668   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2669   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2670   PetscOptionsHeadEnd();
2671   PetscFunctionReturn(PETSC_SUCCESS);
2672 }
2673 
2674 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2675 {
2676   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2677   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2678 
2679   PetscFunctionBegin;
2680   if (!Y->preallocated) {
2681     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2682   } else if (!aij->nz) { /* It does not matter if diagonals of Y only partially lie in maij->A. We just need an estimated preallocation. */
2683     PetscInt nonew = aij->nonew;
2684     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2685     aij->nonew = nonew;
2686   }
2687   PetscCall(MatShift_Basic(Y, a));
2688   PetscFunctionReturn(PETSC_SUCCESS);
2689 }
2690 
2691 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2692 {
2693   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2694 
2695   PetscFunctionBegin;
2696   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2697   PetscFunctionReturn(PETSC_SUCCESS);
2698 }
2699 
2700 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2701 {
2702   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2703 
2704   PetscFunctionBegin;
2705   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2706   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2707   PetscFunctionReturn(PETSC_SUCCESS);
2708 }
2709 
2710 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2711                                        MatGetRow_MPIAIJ,
2712                                        MatRestoreRow_MPIAIJ,
2713                                        MatMult_MPIAIJ,
2714                                        /* 4*/ MatMultAdd_MPIAIJ,
2715                                        MatMultTranspose_MPIAIJ,
2716                                        MatMultTransposeAdd_MPIAIJ,
2717                                        NULL,
2718                                        NULL,
2719                                        NULL,
2720                                        /*10*/ NULL,
2721                                        NULL,
2722                                        NULL,
2723                                        MatSOR_MPIAIJ,
2724                                        MatTranspose_MPIAIJ,
2725                                        /*15*/ MatGetInfo_MPIAIJ,
2726                                        MatEqual_MPIAIJ,
2727                                        MatGetDiagonal_MPIAIJ,
2728                                        MatDiagonalScale_MPIAIJ,
2729                                        MatNorm_MPIAIJ,
2730                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2731                                        MatAssemblyEnd_MPIAIJ,
2732                                        MatSetOption_MPIAIJ,
2733                                        MatZeroEntries_MPIAIJ,
2734                                        /*24*/ MatZeroRows_MPIAIJ,
2735                                        NULL,
2736                                        NULL,
2737                                        NULL,
2738                                        NULL,
2739                                        /*29*/ MatSetUp_MPI_Hash,
2740                                        NULL,
2741                                        NULL,
2742                                        MatGetDiagonalBlock_MPIAIJ,
2743                                        NULL,
2744                                        /*34*/ MatDuplicate_MPIAIJ,
2745                                        NULL,
2746                                        NULL,
2747                                        NULL,
2748                                        NULL,
2749                                        /*39*/ MatAXPY_MPIAIJ,
2750                                        MatCreateSubMatrices_MPIAIJ,
2751                                        MatIncreaseOverlap_MPIAIJ,
2752                                        MatGetValues_MPIAIJ,
2753                                        MatCopy_MPIAIJ,
2754                                        /*44*/ MatGetRowMax_MPIAIJ,
2755                                        MatScale_MPIAIJ,
2756                                        MatShift_MPIAIJ,
2757                                        MatDiagonalSet_MPIAIJ,
2758                                        MatZeroRowsColumns_MPIAIJ,
2759                                        /*49*/ MatSetRandom_MPIAIJ,
2760                                        MatGetRowIJ_MPIAIJ,
2761                                        MatRestoreRowIJ_MPIAIJ,
2762                                        NULL,
2763                                        NULL,
2764                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2765                                        NULL,
2766                                        MatSetUnfactored_MPIAIJ,
2767                                        MatPermute_MPIAIJ,
2768                                        NULL,
2769                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2770                                        MatDestroy_MPIAIJ,
2771                                        MatView_MPIAIJ,
2772                                        NULL,
2773                                        NULL,
2774                                        /*64*/ MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2775                                        NULL,
2776                                        NULL,
2777                                        NULL,
2778                                        MatGetRowMaxAbs_MPIAIJ,
2779                                        /*69*/ MatGetRowMinAbs_MPIAIJ,
2780                                        NULL,
2781                                        NULL,
2782                                        MatFDColoringApply_AIJ,
2783                                        MatSetFromOptions_MPIAIJ,
2784                                        MatFindZeroDiagonals_MPIAIJ,
2785                                        /*75*/ NULL,
2786                                        NULL,
2787                                        NULL,
2788                                        MatLoad_MPIAIJ,
2789                                        NULL,
2790                                        /*80*/ NULL,
2791                                        NULL,
2792                                        NULL,
2793                                        /*83*/ NULL,
2794                                        NULL,
2795                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2796                                        MatPtAPNumeric_MPIAIJ_MPIAIJ,
2797                                        NULL,
2798                                        NULL,
2799                                        /*89*/ MatBindToCPU_MPIAIJ,
2800                                        MatProductSetFromOptions_MPIAIJ,
2801                                        NULL,
2802                                        NULL,
2803                                        MatConjugate_MPIAIJ,
2804                                        /*94*/ NULL,
2805                                        MatSetValuesRow_MPIAIJ,
2806                                        MatRealPart_MPIAIJ,
2807                                        MatImaginaryPart_MPIAIJ,
2808                                        NULL,
2809                                        /*99*/ NULL,
2810                                        NULL,
2811                                        NULL,
2812                                        MatGetRowMin_MPIAIJ,
2813                                        NULL,
2814                                        /*104*/ MatGetSeqNonzeroStructure_MPIAIJ,
2815                                        NULL,
2816                                        MatGetGhosts_MPIAIJ,
2817                                        NULL,
2818                                        NULL,
2819                                        /*109*/ MatMultDiagonalBlock_MPIAIJ,
2820                                        NULL,
2821                                        NULL,
2822                                        NULL,
2823                                        MatGetMultiProcBlock_MPIAIJ,
2824                                        /*114*/ MatFindNonzeroRows_MPIAIJ,
2825                                        MatGetColumnReductions_MPIAIJ,
2826                                        MatInvertBlockDiagonal_MPIAIJ,
2827                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2828                                        MatCreateSubMatricesMPI_MPIAIJ,
2829                                        /*119*/ NULL,
2830                                        NULL,
2831                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2832                                        NULL,
2833                                        NULL,
2834                                        /*124*/ NULL,
2835                                        NULL,
2836                                        MatSetBlockSizes_MPIAIJ,
2837                                        NULL,
2838                                        MatFDColoringSetUp_MPIXAIJ,
2839                                        /*129*/ MatFindOffBlockDiagonalEntries_MPIAIJ,
2840                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2841                                        NULL,
2842                                        NULL,
2843                                        NULL,
2844                                        /*134*/ MatCreateGraph_Simple_AIJ,
2845                                        NULL,
2846                                        MatEliminateZeros_MPIAIJ,
2847                                        MatGetRowSumAbs_MPIAIJ,
2848                                        NULL,
2849                                        /*139*/ NULL,
2850                                        NULL,
2851                                        MatCopyHashToXAIJ_MPI_Hash,
2852                                        MatGetCurrentMemType_MPIAIJ,
2853                                        NULL};
2854 
2855 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2856 {
2857   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2858 
2859   PetscFunctionBegin;
2860   PetscCall(MatStoreValues(aij->A));
2861   PetscCall(MatStoreValues(aij->B));
2862   PetscFunctionReturn(PETSC_SUCCESS);
2863 }
2864 
2865 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2866 {
2867   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2868 
2869   PetscFunctionBegin;
2870   PetscCall(MatRetrieveValues(aij->A));
2871   PetscCall(MatRetrieveValues(aij->B));
2872   PetscFunctionReturn(PETSC_SUCCESS);
2873 }
2874 
2875 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2876 {
2877   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2878   PetscMPIInt size;
2879 
2880   PetscFunctionBegin;
2881   if (B->hash_active) {
2882     B->ops[0]      = b->cops;
2883     B->hash_active = PETSC_FALSE;
2884   }
2885   PetscCall(PetscLayoutSetUp(B->rmap));
2886   PetscCall(PetscLayoutSetUp(B->cmap));
2887 
2888 #if defined(PETSC_USE_CTABLE)
2889   PetscCall(PetscHMapIDestroy(&b->colmap));
2890 #else
2891   PetscCall(PetscFree(b->colmap));
2892 #endif
2893   PetscCall(PetscFree(b->garray));
2894   PetscCall(VecDestroy(&b->lvec));
2895   PetscCall(VecScatterDestroy(&b->Mvctx));
2896 
2897   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2898 
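  /* Recreate the off-diagonal block B. Before assembly B holds global column ids, hence it is created with
     B->cmap->N columns when running on more than one rank (on a single rank there is no off-diagonal part);
     assembly later compacts B to local column ids described by garray. */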
2899   MatSeqXAIJGetOptions_Private(b->B);
2900   PetscCall(MatDestroy(&b->B));
2901   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2902   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2903   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2904   PetscCall(MatSetType(b->B, MATSEQAIJ));
2905   MatSeqXAIJRestoreOptions_Private(b->B);
2906 
2907   MatSeqXAIJGetOptions_Private(b->A);
2908   PetscCall(MatDestroy(&b->A));
2909   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2910   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2911   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2912   PetscCall(MatSetType(b->A, MATSEQAIJ));
2913   MatSeqXAIJRestoreOptions_Private(b->A);
2914 
2915   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2916   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2917   B->preallocated  = PETSC_TRUE;
2918   B->was_assembled = PETSC_FALSE;
2919   B->assembled     = PETSC_FALSE;
2920   PetscFunctionReturn(PETSC_SUCCESS);
2921 }
2922 
2923 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2924 {
2925   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2926   PetscBool   ondiagreset, offdiagreset, memoryreset;
2927 
2928   PetscFunctionBegin;
2929   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2930   PetscCheck(B->insertmode == NOT_SET_VALUES, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot reset preallocation after setting some values but not yet calling MatAssemblyBegin()/MatAssemblyEnd()");
2931   if (B->num_ass == 0) PetscFunctionReturn(PETSC_SUCCESS);
2932 
2933   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->A, &ondiagreset));
2934   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->B, &offdiagreset));
2935   memoryreset = (PetscBool)(ondiagreset || offdiagreset);
2936   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &memoryreset, 1, MPI_C_BOOL, MPI_LOR, PetscObjectComm((PetscObject)B)));
2937   if (!memoryreset) PetscFunctionReturn(PETSC_SUCCESS);
2938 
2939   PetscCall(PetscLayoutSetUp(B->rmap));
2940   PetscCall(PetscLayoutSetUp(B->cmap));
2941   PetscCheck(B->assembled || B->was_assembled, PetscObjectComm((PetscObject)B), PETSC_ERR_ARG_WRONGSTATE, "Should not need to reset preallocation if the matrix was never assembled");
2942   PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
2943   PetscCall(VecScatterDestroy(&b->Mvctx));
2944 
2945   B->preallocated  = PETSC_TRUE;
2946   B->was_assembled = PETSC_FALSE;
2947   B->assembled     = PETSC_FALSE;
2948   /* Log that the state of this object has changed; this will help guarantee that preconditioners get re-setup */
2949   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2950   PetscFunctionReturn(PETSC_SUCCESS);
2951 }
2952 
2953 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2954 {
2955   Mat         mat;
2956   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2957 
2958   PetscFunctionBegin;
2959   *newmat = NULL;
2960   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2961   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2962   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
2963   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
2964   a = (Mat_MPIAIJ *)mat->data;
2965 
2966   mat->factortype = matin->factortype;
2967   mat->assembled  = matin->assembled;
2968   mat->insertmode = NOT_SET_VALUES;
2969 
2970   a->size         = oldmat->size;
2971   a->rank         = oldmat->rank;
2972   a->donotstash   = oldmat->donotstash;
2973   a->roworiented  = oldmat->roworiented;
2974   a->rowindices   = NULL;
2975   a->rowvalues    = NULL;
2976   a->getrowactive = PETSC_FALSE;
2977 
2978   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
2979   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
2980   if (matin->hash_active) {
2981     PetscCall(MatSetUp(mat));
2982   } else {
2983     mat->preallocated = matin->preallocated;
2984     if (oldmat->colmap) {
2985 #if defined(PETSC_USE_CTABLE)
2986       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
2987 #else
2988       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
2989       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
2990 #endif
2991     } else a->colmap = NULL;
2992     if (oldmat->garray) {
2993       PetscInt len;
2994       len = oldmat->B->cmap->n;
2995       PetscCall(PetscMalloc1(len, &a->garray));
2996       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
2997     } else a->garray = NULL;
2998 
2999     /* It may happen that MatDuplicate() is called with a non-assembled matrix;
3000       in fact, MatDuplicate() only requires the matrix to be preallocated.
3001       This may happen, for example, inside DMCreateMatrix_Shell() */
3002     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
3003     if (oldmat->Mvctx) {
3004       a->Mvctx = oldmat->Mvctx;
3005       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3006     }
3007     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3008     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3009   }
3010   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3011   *newmat = mat;
3012   PetscFunctionReturn(PETSC_SUCCESS);
3013 }
3014 
3015 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3016 {
3017   PetscBool isbinary, ishdf5;
3018 
3019   PetscFunctionBegin;
3020   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3021   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3022   /* force binary viewer to load .info file if it has not yet done so */
3023   PetscCall(PetscViewerSetUp(viewer));
3024   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3025   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3026   if (isbinary) {
3027     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3028   } else if (ishdf5) {
3029 #if defined(PETSC_HAVE_HDF5)
3030     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3031 #else
3032     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3033 #endif
3034   } else {
3035     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3036   }
3037   PetscFunctionReturn(PETSC_SUCCESS);
3038 }
3039 
3040 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3041 {
3042   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3043   PetscInt    *rowidxs, *colidxs;
3044   PetscScalar *matvals;
3045 
3046   PetscFunctionBegin;
3047   PetscCall(PetscViewerSetUp(viewer));
3048 
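  /* Layout of the binary stream read below (as this routine assumes):
       header[0] = MAT_FILE_CLASSID, header[1] = M (global rows), header[2] = N (global columns),
       header[3] = nz (total number of nonzeros, or PETSC_INT_MAX when not recorded),
     followed by the M row lengths, the nz column indices, and finally the nz scalar values */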
3049   /* read in matrix header */
3050   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3051   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3052   M  = header[1];
3053   N  = header[2];
3054   nz = header[3];
3055   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3056   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3057   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3058 
3059   /* set block sizes from the viewer's .info file */
3060   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3061   /* set global sizes if not set already */
3062   if (mat->rmap->N < 0) mat->rmap->N = M;
3063   if (mat->cmap->N < 0) mat->cmap->N = N;
3064   PetscCall(PetscLayoutSetUp(mat->rmap));
3065   PetscCall(PetscLayoutSetUp(mat->cmap));
3066 
3067   /* check if the matrix sizes are correct */
3068   PetscCall(MatGetSize(mat, &rows, &cols));
3069   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3070 
3071   /* read in row lengths and build row indices */
3072   PetscCall(MatGetLocalSize(mat, &m, NULL));
3073   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3074   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3075   rowidxs[0] = 0;
3076   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3077   if (nz != PETSC_INT_MAX) {
3078     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3079     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3080   }
3081 
3082   /* read in column indices and matrix values */
3083   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3084   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3085   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3086   /* store matrix indices and values */
3087   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3088   PetscCall(PetscFree(rowidxs));
3089   PetscCall(PetscFree2(colidxs, matvals));
3090   PetscFunctionReturn(PETSC_SUCCESS);
3091 }
3092 
3093 /* Not scalable because of ISAllGather() unless getting all columns. */
3094 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3095 {
3096   IS          iscol_local;
3097   PetscBool   isstride;
3098   PetscMPIInt gisstride = 0;
3099 
3100   PetscFunctionBegin;
3101   /* check if we are grabbing all columns */
3102   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3103 
3104   if (isstride) {
3105     PetscInt start, len, mstart, mlen;
3106     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3107     PetscCall(ISGetLocalSize(iscol, &len));
3108     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3109     if (mstart == start && mlen - mstart == len) gisstride = 1;
3110   }
3111 
3112   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3113   if (gisstride) {
3114     PetscInt N;
3115     PetscCall(MatGetSize(mat, NULL, &N));
3116     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3117     PetscCall(ISSetIdentity(iscol_local));
3118     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3119   } else {
3120     PetscInt cbs;
3121     PetscCall(ISGetBlockSize(iscol, &cbs));
3122     PetscCall(ISAllGather(iscol, &iscol_local));
3123     PetscCall(ISSetBlockSize(iscol_local, cbs));
3124   }
3125 
3126   *isseq = iscol_local;
3127   PetscFunctionReturn(PETSC_SUCCESS);
3128 }
3129 
3130 /*
3131  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and an iscol_local of global size
3132  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3133 
3134  Input Parameters:
3135 +   mat - matrix
3136 +   isrow - parallel row index set; its local indices are a subset of local rows of `mat`,
3137            i.e., mat->rstart <= isrow[i] < mat->rend
3138 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3139            i.e., mat->cstart <= iscol[i] < mat->cend
3140 
3141  Output Parameters:
3142 +   isrow_d - sequential row index set for retrieving mat->A
3143 .   iscol_d - sequential column index set for retrieving mat->A
3144 .   iscol_o - sequential column index set for retrieving mat->B
3145 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3146  */
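/* A hypothetical illustration on 2 ranks: mat has 8 global columns split 0..3 | 4..7; rank 0 requests
   iscol = {1,3} and rank 1 requests iscol = {5}. On rank 0, iscol_d = {1,3} (local column indices into mat->A);
   if mat->B on rank 0 has a column for global column 5, iscol_o selects that B column and its garray entry
   is 2, since global column 5 is the third entry of the concatenated iscol. */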
3147 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3148 {
3149   Vec             x, cmap;
3150   const PetscInt *is_idx;
3151   PetscScalar    *xarray, *cmaparray;
3152   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3153   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3154   Mat             B    = a->B;
3155   Vec             lvec = a->lvec, lcmap;
3156   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3157   MPI_Comm        comm;
3158   VecScatter      Mvctx = a->Mvctx;
3159 
3160   PetscFunctionBegin;
3161   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3162   PetscCall(ISGetLocalSize(iscol, &ncols));
3163 
3164   /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
3165   PetscCall(MatCreateVecs(mat, &x, NULL));
3166   PetscCall(VecSet(x, -1.0));
3167   PetscCall(VecDuplicate(x, &cmap));
3168   PetscCall(VecSet(cmap, -1.0));
3169 
3170   /* Get start indices */
3171   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3172   isstart -= ncols;
3173   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3174 
3175   PetscCall(ISGetIndices(iscol, &is_idx));
3176   PetscCall(VecGetArray(x, &xarray));
3177   PetscCall(VecGetArray(cmap, &cmaparray));
3178   PetscCall(PetscMalloc1(ncols, &idx));
3179   for (i = 0; i < ncols; i++) {
3180     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3181     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3182     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3183   }
3184   PetscCall(VecRestoreArray(x, &xarray));
3185   PetscCall(VecRestoreArray(cmap, &cmaparray));
3186   PetscCall(ISRestoreIndices(iscol, &is_idx));
3187 
3188   /* Get iscol_d */
3189   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3190   PetscCall(ISGetBlockSize(iscol, &i));
3191   PetscCall(ISSetBlockSize(*iscol_d, i));
3192 
3193   /* Get isrow_d */
3194   PetscCall(ISGetLocalSize(isrow, &m));
3195   rstart = mat->rmap->rstart;
3196   PetscCall(PetscMalloc1(m, &idx));
3197   PetscCall(ISGetIndices(isrow, &is_idx));
3198   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3199   PetscCall(ISRestoreIndices(isrow, &is_idx));
3200 
3201   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3202   PetscCall(ISGetBlockSize(isrow, &i));
3203   PetscCall(ISSetBlockSize(*isrow_d, i));
3204 
3205   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3206   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3207   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3208 
3209   PetscCall(VecDuplicate(lvec, &lcmap));
3210 
3211   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3212   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3213 
3214   /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
3215   /* off-process column indices */
3216   count = 0;
3217   PetscCall(PetscMalloc1(Bn, &idx));
3218   PetscCall(PetscMalloc1(Bn, &cmap1));
3219 
3220   PetscCall(VecGetArray(lvec, &xarray));
3221   PetscCall(VecGetArray(lcmap, &cmaparray));
3222   for (i = 0; i < Bn; i++) {
3223     if (PetscRealPart(xarray[i]) > -1.0) {
3224       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3225       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3226       count++;
3227     }
3228   }
3229   PetscCall(VecRestoreArray(lvec, &xarray));
3230   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3231 
3232   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3233   /* cannot ensure iscol_o has same blocksize as iscol! */
3234 
3235   PetscCall(PetscFree(idx));
3236   *garray = cmap1;
3237 
3238   PetscCall(VecDestroy(&x));
3239   PetscCall(VecDestroy(&cmap));
3240   PetscCall(VecDestroy(&lcmap));
3241   PetscFunctionReturn(PETSC_SUCCESS);
3242 }
3243 
3244 /* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
3245 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3246 {
3247   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3248   Mat         M = NULL;
3249   MPI_Comm    comm;
3250   IS          iscol_d, isrow_d, iscol_o;
3251   Mat         Asub = NULL, Bsub = NULL;
3252   PetscInt    n, count, M_size, N_size;
3253 
3254   PetscFunctionBegin;
3255   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3256 
3257   if (call == MAT_REUSE_MATRIX) {
3258     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3259     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3260     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3261 
3262     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3263     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3264 
3265     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3266     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3267 
3268     /* Update diagonal and off-diagonal portions of submat */
3269     asub = (Mat_MPIAIJ *)(*submat)->data;
3270     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3271     PetscCall(ISGetLocalSize(iscol_o, &n));
3272     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3273     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3274     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3275 
3276   } else { /* call == MAT_INITIAL_MATRIX) */
3277     PetscInt *garray, *garray_compact;
3278     PetscInt  BsubN;
3279 
3280     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3281     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3282 
3283     /* Create local submatrices Asub and Bsub */
3284     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3285     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3286 
3287     // Compact garray so it is not of size Bn
3288     PetscCall(ISGetSize(iscol_o, &count));
3289     PetscCall(PetscMalloc1(count, &garray_compact));
3290     PetscCall(PetscArraycpy(garray_compact, garray, count));
3291 
3292     /* Create submatrix M */
3293     PetscCall(ISGetSize(isrow, &M_size));
3294     PetscCall(ISGetSize(iscol, &N_size));
3295     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, M_size, N_size, Asub, Bsub, garray_compact, &M));
3296 
3297     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3298     asub = (Mat_MPIAIJ *)M->data;
3299 
3300     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3301     n = asub->B->cmap->N;
3302     if (BsubN > n) {
3303       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3304       const PetscInt *idx;
3305       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3306       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3307 
3308       PetscCall(PetscMalloc1(n, &idx_new));
3309       j = 0;
3310       PetscCall(ISGetIndices(iscol_o, &idx));
3311       for (i = 0; i < n; i++) {
3312         if (j >= BsubN) break;
3313         while (subgarray[i] > garray[j]) j++;
3314 
3315         PetscCheck(subgarray[i] == garray[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be smaller than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]));
3316         idx_new[i] = idx[j++];
3317       }
3318       PetscCall(ISRestoreIndices(iscol_o, &idx));
3319 
3320       PetscCall(ISDestroy(&iscol_o));
3321       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3322 
3323     } else PetscCheck(BsubN >= n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3324 
3325     PetscCall(PetscFree(garray));
3326     *submat = M;
3327 
3328     /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
3329     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3330     PetscCall(ISDestroy(&isrow_d));
3331 
3332     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3333     PetscCall(ISDestroy(&iscol_d));
3334 
3335     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3336     PetscCall(ISDestroy(&iscol_o));
3337   }
3338   PetscFunctionReturn(PETSC_SUCCESS);
3339 }
3340 
3341 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3342 {
3343   IS        iscol_local = NULL, isrow_d;
3344   PetscInt  csize;
3345   PetscInt  n, i, j, start, end;
3346   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3347   MPI_Comm  comm;
3348 
3349   PetscFunctionBegin;
3350   /* If isrow has same processor distribution as mat,
3351      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with the global size of iscol */
3352   if (call == MAT_REUSE_MATRIX) {
3353     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3354     if (isrow_d) {
3355       sameRowDist  = PETSC_TRUE;
3356       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3357     } else {
3358       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3359       if (iscol_local) {
3360         sameRowDist  = PETSC_TRUE;
3361         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3362       }
3363     }
3364   } else {
3365     /* Check if isrow has same processor distribution as mat */
3366     sameDist[0] = PETSC_FALSE;
3367     PetscCall(ISGetLocalSize(isrow, &n));
3368     if (!n) {
3369       sameDist[0] = PETSC_TRUE;
3370     } else {
3371       PetscCall(ISGetMinMax(isrow, &i, &j));
3372       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3373       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3374     }
3375 
3376     /* Check if iscol has same processor distribution as mat */
3377     sameDist[1] = PETSC_FALSE;
3378     PetscCall(ISGetLocalSize(iscol, &n));
3379     if (!n) {
3380       sameDist[1] = PETSC_TRUE;
3381     } else {
3382       PetscCall(ISGetMinMax(iscol, &i, &j));
3383       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3384       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3385     }
3386 
3387     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3388     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPI_C_BOOL, MPI_LAND, comm));
3389     sameRowDist = tsameDist[0];
3390   }
3391 
3392   if (sameRowDist) {
3393     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3394       /* isrow and iscol have same processor distribution as mat */
3395       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3396       PetscFunctionReturn(PETSC_SUCCESS);
3397     } else { /* sameRowDist */
3398       /* isrow has same processor distribution as mat */
3399       if (call == MAT_INITIAL_MATRIX) {
3400         PetscBool sorted;
3401         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3402         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3403         PetscCall(ISGetSize(iscol, &i));
3404         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3405 
3406         PetscCall(ISSorted(iscol_local, &sorted));
3407         if (sorted) {
3408           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
3409           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3410           PetscFunctionReturn(PETSC_SUCCESS);
3411         }
3412       } else { /* call == MAT_REUSE_MATRIX */
3413         IS iscol_sub;
3414         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3415         if (iscol_sub) {
3416           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3417           PetscFunctionReturn(PETSC_SUCCESS);
3418         }
3419       }
3420     }
3421   }
3422 
3423   /* General case: iscol -> iscol_local which has global size of iscol */
3424   if (call == MAT_REUSE_MATRIX) {
3425     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3426     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3427   } else {
3428     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3429   }
3430 
3431   PetscCall(ISGetLocalSize(iscol, &csize));
3432   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3433 
3434   if (call == MAT_INITIAL_MATRIX) {
3435     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3436     PetscCall(ISDestroy(&iscol_local));
3437   }
3438   PetscFunctionReturn(PETSC_SUCCESS);
3439 }
3440 
3441 /*@C
3442   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3443   and "off-diagonal" part of the matrix in CSR format.
3444 
3445   Collective
3446 
3447   Input Parameters:
3448 + comm   - MPI communicator
3449 . M      - the global row size
3450 . N      - the global column size
3451 . A      - "diagonal" portion of matrix
3452 . B      - "off-diagonal" portion of the matrix; if garray is `NULL`, B should use global col ids and have N columns; if garray is not `NULL`, B should use local col ids and have as many columns as entries in garray
3453 - garray - either `NULL` or the global index of `B` columns. If not `NULL`, it should be allocated by `PetscMalloc1()` and will be owned by `mat` thereafter.
3454 
3455   Output Parameter:
3456 . mat - the matrix, with input `A` as its local diagonal matrix
3457 
3458   Level: advanced
3459 
3460   Notes:
3461   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3462 
3463   `A` and `B` become part of the output mat. The user cannot use `A` and `B` anymore.
3464 
3465   If `garray` is `NULL`, `B` will be compacted to use local indices. In this sense, `B`'s sparsity pattern (nonzerostate) will be changed. If `B` is a device matrix, we need to somehow also update
3466   `B`'s copy on device. We do so by increasing `B`'s nonzerostate. When `B` is used on device, device matrix types should detect this change (ref. internal routines `MatSeqAIJCUSPARSECopyToGPU()` or
3467   `MatAssemblyEnd_SeqAIJKokkos()`) and will just destroy and then recreate the device copy of `B`. This is not optimal, but it is easy to implement and less hacky. To avoid this overhead, try to compute `garray`
3468   yourself; see the algorithms in the private function `MatSetUpMultiply_MPIAIJ()`.
3469 
3470   The `NULL`-ness of `garray` doesn't need to be collective; in other words, `garray` can be `NULL` on some processes while not on others.
3471 
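  Example Usage:
  A minimal sketch (assuming `Ad` and `Ao` are sequential AIJ matrices built by the caller and `gidx` was obtained with `PetscMalloc1()`; all three are owned by `C` after the call):
.vb
  Mat C;

  PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, M, N, Ad, Ao, gidx, &C));
  /* Ad, Ao, and gidx must not be used or destroyed by the caller afterwards */
.ve
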
3472 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3473 @*/
3474 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, PetscInt M, PetscInt N, Mat A, Mat B, PetscInt *garray, Mat *mat)
3475 {
3476   PetscInt    m, n;
3477   MatType     mpi_mat_type;
3478   Mat_MPIAIJ *mpiaij;
3479   Mat         C;
3480 
3481   PetscFunctionBegin;
3482   PetscCall(MatCreate(comm, &C));
3483   PetscCall(MatGetSize(A, &m, &n));
3484   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3485   PetscCheck(A->rmap->bs == B->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3486 
3487   PetscCall(MatSetSizes(C, m, n, M, N));
3488   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3489   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3490   PetscCall(MatSetType(C, mpi_mat_type));
3491   if (!garray) {
3492     const PetscScalar *ba;
3493 
3494     B->nonzerostate++;
3495     PetscCall(MatSeqAIJGetArrayRead(B, &ba)); /* Since we will destroy B's device copy, we need to make sure the host copy is up to date */
3496     PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3497   }
3498 
3499   PetscCall(MatSetBlockSizes(C, A->rmap->bs, A->cmap->bs));
3500   PetscCall(PetscLayoutSetUp(C->rmap));
3501   PetscCall(PetscLayoutSetUp(C->cmap));
3502 
3503   mpiaij              = (Mat_MPIAIJ *)C->data;
3504   mpiaij->A           = A;
3505   mpiaij->B           = B;
3506   mpiaij->garray      = garray;
3507   C->preallocated     = PETSC_TRUE;
3508   C->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
3509 
3510   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3511   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3512   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A's and B's, and
3513    also compacts mpiaij->B (if garray is NULL), reducing its col ids and size
3514    */
3515   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3516   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3517   PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3518   *mat = C;
3519   PetscFunctionReturn(PETSC_SUCCESS);
3520 }
3521 
3522 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3523 
3524 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3525 {
3526   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3527   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3528   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3529   Mat             M, Msub, B = a->B;
3530   MatScalar      *aa;
3531   Mat_SeqAIJ     *aij;
3532   PetscInt       *garray = a->garray, *colsub, Ncols;
3533   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3534   IS              iscol_sub, iscmap;
3535   const PetscInt *is_idx, *cmap;
3536   PetscBool       allcolumns = PETSC_FALSE;
3537   MPI_Comm        comm;
3538 
3539   PetscFunctionBegin;
3540   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3541   if (call == MAT_REUSE_MATRIX) {
3542     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3543     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3544     PetscCall(ISGetLocalSize(iscol_sub, &count));
3545 
3546     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3547     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3548 
3549     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3550     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3551 
3552     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3553 
3554   } else { /* call == MAT_INITIAL_MATRIX) */
3555     PetscBool flg;
3556 
3557     PetscCall(ISGetLocalSize(iscol, &n));
3558     PetscCall(ISGetSize(iscol, &Ncols));
3559 
3560     /* (1) iscol -> nonscalable iscol_local */
3561     /* Check for special case: each processor gets entire matrix columns */
3562     PetscCall(ISIdentity(iscol_local, &flg));
3563     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3564     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3565     if (allcolumns) {
3566       iscol_sub = iscol_local;
3567       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3568       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3569 
3570     } else {
3571       /* (2) iscol_local -> iscol_sub and iscmap. Implementation below requires iscol_local to be sorted; it can have duplicate indices */
3572       PetscInt *idx, *cmap1, k;
3573       PetscCall(PetscMalloc1(Ncols, &idx));
3574       PetscCall(PetscMalloc1(Ncols, &cmap1));
3575       PetscCall(ISGetIndices(iscol_local, &is_idx));
3576       count = 0;
3577       k     = 0;
3578       for (i = 0; i < Ncols; i++) {
3579         j = is_idx[i];
3580         if (j >= cstart && j < cend) {
3581           /* diagonal part of mat */
3582           idx[count]     = j;
3583           cmap1[count++] = i; /* column index in submat */
3584         } else if (Bn) {
3585           /* off-diagonal part of mat */
3586           if (j == garray[k]) {
3587             idx[count]     = j;
3588             cmap1[count++] = i; /* column index in submat */
3589           } else if (j > garray[k]) {
3590             while (j > garray[k] && k < Bn - 1) k++;
3591             if (j == garray[k]) {
3592               idx[count]     = j;
3593               cmap1[count++] = i; /* column index in submat */
3594             }
3595           }
3596         }
3597       }
3598       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3599 
3600       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3601       PetscCall(ISGetBlockSize(iscol, &cbs));
3602       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3603 
3604       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3605     }
3606 
3607     /* (3) Create sequential Msub */
3608     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3609   }
3610 
3611   PetscCall(ISGetLocalSize(iscol_sub, &count));
3612   aij = (Mat_SeqAIJ *)Msub->data;
3613   ii  = aij->i;
3614   PetscCall(ISGetIndices(iscmap, &cmap));
3615 
3616   /*
3617       m - number of local rows
3618       Ncols - number of columns (same on all processors)
3619       rstart - first row in new global matrix generated
3620   */
3621   PetscCall(MatGetSize(Msub, &m, NULL));
3622 
3623   if (call == MAT_INITIAL_MATRIX) {
3624     /* (4) Create parallel newmat */
3625     PetscMPIInt rank, size;
3626     PetscInt    csize;
3627 
3628     PetscCallMPI(MPI_Comm_size(comm, &size));
3629     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3630 
3631     /*
3632         Determine the number of non-zeros in the diagonal and off-diagonal
3633         portions of the matrix in order to do correct preallocation
3634     */
3635 
3636     /* first get start and end of "diagonal" columns */
3637     PetscCall(ISGetLocalSize(iscol, &csize));
3638     if (csize == PETSC_DECIDE) {
3639       PetscCall(ISGetSize(isrow, &mglobal));
3640       if (mglobal == Ncols) { /* square matrix */
3641         nlocal = m;
3642       } else {
3643         nlocal = Ncols / size + ((Ncols % size) > rank);
3644       }
3645     } else {
3646       nlocal = csize;
3647     }
3648     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3649     rstart = rend - nlocal;
3650     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3651 
3652     /* next, compute all the lengths */
3653     jj = aij->j;
3654     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3655     olens = dlens + m;
3656     for (i = 0; i < m; i++) {
3657       jend = ii[i + 1] - ii[i];
3658       olen = 0;
3659       dlen = 0;
3660       for (j = 0; j < jend; j++) {
3661         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3662         else dlen++;
3663         jj++;
3664       }
3665       olens[i] = olen;
3666       dlens[i] = dlen;
3667     }
3668 
3669     PetscCall(ISGetBlockSize(isrow, &bs));
3670     PetscCall(ISGetBlockSize(iscol, &cbs));
3671 
3672     PetscCall(MatCreate(comm, &M));
3673     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3674     PetscCall(MatSetBlockSizes(M, bs, cbs));
3675     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3676     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3677     PetscCall(PetscFree(dlens));
3678 
3679   } else { /* call == MAT_REUSE_MATRIX */
3680     M = *newmat;
3681     PetscCall(MatGetLocalSize(M, &i, NULL));
3682     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3683     PetscCall(MatZeroEntries(M));
3684     /*
3685          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3686        rather than the slower MatSetValues().
3687     */
3688     M->was_assembled = PETSC_TRUE;
3689     M->assembled     = PETSC_FALSE;
3690   }
3691 
3692   /* (5) Set values of Msub to *newmat */
3693   PetscCall(PetscMalloc1(count, &colsub));
3694   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3695 
3696   jj = aij->j;
3697   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3698   for (i = 0; i < m; i++) {
3699     row = rstart + i;
3700     nz  = ii[i + 1] - ii[i];
3701     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3702     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3703     jj += nz;
3704     aa += nz;
3705   }
3706   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3707   PetscCall(ISRestoreIndices(iscmap, &cmap));
3708 
3709   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3710   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3711 
3712   PetscCall(PetscFree(colsub));
3713 
3714   /* save Msub, iscol_sub and iscmap used in processor for next request */
3715   if (call == MAT_INITIAL_MATRIX) {
3716     *newmat = M;
3717     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3718     PetscCall(MatDestroy(&Msub));
3719 
3720     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3721     PetscCall(ISDestroy(&iscol_sub));
3722 
3723     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3724     PetscCall(ISDestroy(&iscmap));
3725 
3726     if (iscol_local) {
3727       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3728       PetscCall(ISDestroy(&iscol_local));
3729     }
3730   }
3731   PetscFunctionReturn(PETSC_SUCCESS);
3732 }
3733 
3734 /*
3735     Not great since it makes two copies of the submatrix: first a SeqAIJ
3736   locally, and then the end result by concatenating the local matrices.
3737   Writing it directly would be much like MatCreateSubMatrices_MPIAIJ().
3738 
3739   This requires a sequential iscol containing all the column indices.
3740 */
3741 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3742 {
3743   PetscMPIInt rank, size;
3744   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3745   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3746   Mat         M, Mreuse;
3747   MatScalar  *aa, *vwork;
3748   MPI_Comm    comm;
3749   Mat_SeqAIJ *aij;
3750   PetscBool   colflag, allcolumns = PETSC_FALSE;
3751 
3752   PetscFunctionBegin;
3753   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3754   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3755   PetscCallMPI(MPI_Comm_size(comm, &size));
3756 
3757   /* Check for special case: each processor gets entire matrix columns */
3758   PetscCall(ISIdentity(iscol, &colflag));
3759   PetscCall(ISGetLocalSize(iscol, &n));
3760   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3761   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3762 
3763   if (call == MAT_REUSE_MATRIX) {
3764     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3765     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3766     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3767   } else {
3768     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3769   }
3770 
3771   /*
3772       m - number of local rows
3773       n - number of columns (same on all processors)
3774       rstart - first row in new global matrix generated
3775   */
3776   PetscCall(MatGetSize(Mreuse, &m, &n));
3777   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3778   if (call == MAT_INITIAL_MATRIX) {
3779     aij = (Mat_SeqAIJ *)Mreuse->data;
3780     ii  = aij->i;
3781     jj  = aij->j;
3782 
3783     /*
3784         Determine the number of non-zeros in the diagonal and off-diagonal
3785         portions of the matrix in order to do correct preallocation
3786     */
3787 
3788     /* first get start and end of "diagonal" columns */
3789     if (csize == PETSC_DECIDE) {
3790       PetscCall(ISGetSize(isrow, &mglobal));
3791       if (mglobal == n) { /* square matrix */
3792         nlocal = m;
3793       } else {
3794         nlocal = n / size + ((n % size) > rank);
3795       }
3796     } else {
3797       nlocal = csize;
3798     }
3799     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3800     rstart = rend - nlocal;
3801     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3802 
3803     /* next, compute all the lengths */
3804     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3805     olens = dlens + m;
3806     for (i = 0; i < m; i++) {
3807       jend = ii[i + 1] - ii[i];
3808       olen = 0;
3809       dlen = 0;
3810       for (j = 0; j < jend; j++) {
3811         if (*jj < rstart || *jj >= rend) olen++;
3812         else dlen++;
3813         jj++;
3814       }
3815       olens[i] = olen;
3816       dlens[i] = dlen;
3817     }
3818     PetscCall(MatCreate(comm, &M));
3819     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3820     PetscCall(MatSetBlockSizes(M, bs, cbs));
3821     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3822     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3823     PetscCall(PetscFree(dlens));
3824   } else {
3825     PetscInt ml, nl;
3826 
3827     M = *newmat;
3828     PetscCall(MatGetLocalSize(M, &ml, &nl));
3829     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3830     PetscCall(MatZeroEntries(M));
3831     /*
3832          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3833        rather than the slower MatSetValues().
3834     */
3835     M->was_assembled = PETSC_TRUE;
3836     M->assembled     = PETSC_FALSE;
3837   }
3838   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3839   aij = (Mat_SeqAIJ *)Mreuse->data;
3840   ii  = aij->i;
3841   jj  = aij->j;
3842 
3843   /* trigger copy to CPU if needed */
3844   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3845   for (i = 0; i < m; i++) {
3846     row   = rstart + i;
3847     nz    = ii[i + 1] - ii[i];
3848     cwork = jj;
3849     jj    = PetscSafePointerPlusOffset(jj, nz);
3850     vwork = aa;
3851     aa    = PetscSafePointerPlusOffset(aa, nz);
3852     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3853   }
3854   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3855 
3856   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3857   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3858   *newmat = M;
3859 
3860   /* save submatrix used in processor for next request */
3861   if (call == MAT_INITIAL_MATRIX) {
3862     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3863     PetscCall(MatDestroy(&Mreuse));
3864   }
3865   PetscFunctionReturn(PETSC_SUCCESS);
3866 }
3867 
3868 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3869 {
3870   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3871   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3872   const PetscInt *JJ;
3873   PetscBool       nooffprocentries;
3874   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3875 
3876   PetscFunctionBegin;
3877   PetscCall(PetscLayoutSetUp(B->rmap));
3878   PetscCall(PetscLayoutSetUp(B->cmap));
3879   m       = B->rmap->n;
3880   cstart  = B->cmap->rstart;
3881   cend    = B->cmap->rend;
3882   rstart  = B->rmap->rstart;
3883   irstart = Ii[0];
3884 
3885   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3886 
3887   if (PetscDefined(USE_DEBUG)) {
3888     for (i = 0; i < m; i++) {
3889       nnz = Ii[i + 1] - Ii[i];
3890       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3891       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3892       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3893       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3894     }
3895   }
3896 
3897   for (i = 0; i < m; i++) {
3898     nnz     = Ii[i + 1] - Ii[i];
3899     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3900     nnz_max = PetscMax(nnz_max, nnz);
3901     d       = 0;
3902     for (j = 0; j < nnz; j++) {
3903       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3904     }
3905     d_nnz[i] = d;
3906     o_nnz[i] = nnz - d;
3907   }
3908   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3909   PetscCall(PetscFree2(d_nnz, o_nnz));
3910 
3911   for (i = 0; i < m; i++) {
3912     ii = i + rstart;
3913     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3914   }
3915   nooffprocentries    = B->nooffprocentries;
3916   B->nooffprocentries = PETSC_TRUE;
3917   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3918   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3919   B->nooffprocentries = nooffprocentries;
3920 
3921   /* count number of entries below block diagonal */
3922   PetscCall(PetscFree(Aij->ld));
3923   PetscCall(PetscCalloc1(m, &ld));
3924   Aij->ld = ld;
3925   for (i = 0; i < m; i++) {
3926     nnz = Ii[i + 1] - Ii[i];
3927     j   = 0;
3928     while (j < nnz && J[j] < cstart) j++;
3929     ld[i] = j;
3930     if (J) J += nnz;
3931   }
3932 
3933   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3934   PetscFunctionReturn(PETSC_SUCCESS);
3935 }
3936 
3937 /*@
3938   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3939   (the default parallel PETSc format).
3940 
3941   Collective
3942 
3943   Input Parameters:
3944 + B - the matrix
3945 . i - the indices into `j` for the start of each local row (indices start with zero)
3946 . j - the column indices for each local row (indices start with zero)
3947 - v - optional values in the matrix
3948 
3949   Level: developer
3950 
3951   Notes:
3952   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3953   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3954   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3955 
3956   The `i` and `j` indices are 0 based, and the `i` entries are offsets into the local `j` array.
3957 
3958   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3959 
3960   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
3961 
3962   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
3963   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
3964 
3965   The format which is used for the sparse matrix input is equivalent to a
3966   row-major ordering, i.e., for the following matrix, the input data expected is
3967   as shown
3968 .vb
3969         1 0 0
3970         2 0 3     P0
3971        -------
3972         4 5 6     P1
3973 
3974      Process0 [P0] rows_owned=[0,1]
3975         i =  {0,1,3}  [size = nrow+1  = 2+1]
3976         j =  {0,0,2}  [size = 3]
3977         v =  {1,2,3}  [size = 3]
3978 
3979      Process1 [P1] rows_owned=[2]
3980         i =  {0,3}    [size = nrow+1  = 1+1]
3981         j =  {0,1,2}  [size = 3]
3982         v =  {4,5,6}  [size = 3]
3983 .ve
3984 
3985 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
3986           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3987 @*/
3988 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3989 {
3990   PetscFunctionBegin;
3991   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
3992   PetscFunctionReturn(PETSC_SUCCESS);
3993 }
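
/*
  Illustrative usage sketch (hypothetical helper, not used elsewhere in this file): building the 3x3
  matrix from the manual page above on exactly two MPI ranks by handing each rank's local CSR rows to
  MatMPIAIJSetPreallocationCSR(). The name ExampleSetPreallocationCSR and the hard-coded arrays are
  illustrative only.
*/
static PetscErrorCode ExampleSetPreallocationCSR(MPI_Comm comm, Mat *newA)
{
  PetscMPIInt       rank;
  /* rank 0 owns rows {0,1}, rank 1 owns row {2}; column indices are global and 0-based */
  const PetscInt    i0[] = {0, 1, 3}, j0[] = {0, 0, 2};
  const PetscScalar v0[] = {1.0, 2.0, 3.0};
  const PetscInt    i1[] = {0, 3}, j1[] = {0, 1, 2};
  const PetscScalar v1[] = {4.0, 5.0, 6.0};

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatCreate(comm, newA));
  PetscCall(MatSetSizes(*newA, rank == 0 ? 2 : 1, PETSC_DECIDE, 3, 3));
  PetscCall(MatSetType(*newA, MATMPIAIJ));
  /* preallocates, inserts the values, and assembles the matrix in one call */
  if (rank == 0) PetscCall(MatMPIAIJSetPreallocationCSR(*newA, i0, j0, v0));
  else PetscCall(MatMPIAIJSetPreallocationCSR(*newA, i1, j1, v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}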
3994 
3995 /*@
3996   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
3997   (the default parallel PETSc format).  For good matrix assembly performance
3998   the user should preallocate the matrix storage by setting the parameters
3999   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4000 
4001   Collective
4002 
4003   Input Parameters:
4004 + B     - the matrix
4005 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4006            (same value is used for all local rows)
4007 . d_nnz - array containing the number of nonzeros in the various rows of the
4008            DIAGONAL portion of the local submatrix (possibly different for each row)
4009            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4010            The size of this array is equal to the number of local rows, i.e 'm'.
4011            For matrices that will be factored, you must leave room for (and set)
4012            the diagonal entry even if it is zero.
4013 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4014            submatrix (same value is used for all local rows).
4015 - o_nnz - array containing the number of nonzeros in the various rows of the
4016            OFF-DIAGONAL portion of the local submatrix (possibly different for
4017            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4018            structure. The size of this array is equal to the number
4019            of local rows, i.e 'm'.
4020 
4021   Example Usage:
4022   Consider the following 8x8 matrix with 34 non-zero values, that is
4023   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4024   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4025   as follows
4026 
4027 .vb
4028             1  2  0  |  0  3  0  |  0  4
4029     Proc0   0  5  6  |  7  0  0  |  8  0
4030             9  0 10  | 11  0  0  | 12  0
4031     -------------------------------------
4032            13  0 14  | 15 16 17  |  0  0
4033     Proc1   0 18  0  | 19 20 21  |  0  0
4034             0  0  0  | 22 23  0  | 24  0
4035     -------------------------------------
4036     Proc2  25 26 27  |  0  0 28  | 29  0
4037            30  0  0  | 31 32 33  |  0 34
4038 .ve
4039 
4040   This can be represented as a collection of submatrices as
4041 .vb
4042       A B C
4043       D E F
4044       G H I
4045 .ve
4046 
4047   Where the submatrices A,B,C are owned by proc0, D,E,F are
4048   owned by proc1, G,H,I are owned by proc2.
4049 
4050   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4051   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4052   The 'M','N' parameters are 8,8, and have the same values on all procs.
4053 
4054   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4055   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4056   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4057   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4058   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4059   matrix, and [DF] as another `MATSEQAIJ` matrix.
4060 
4061   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4062   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4063   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4064   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4065   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4066   In this case, the values of `d_nz`, `o_nz` are
4067 .vb
4068      proc0  dnz = 2, o_nz = 2
4069      proc1  dnz = 3, o_nz = 2
4070      proc2  dnz = 1, o_nz = 4
4071 .ve
4072   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4073   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4074   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4075   34 values.
4076 
4077   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4078   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4079   In the above case the values for `d_nnz`, `o_nnz` are
4080 .vb
4081      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4082      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4083      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4084 .ve
4085   Here the space allocated is the sum of all the above values, i.e., 34, and
4086   hence pre-allocation is perfect.
4087 
4088   Level: intermediate
4089 
4090   Notes:
4091   If the *_nnz parameter is given then the *_nz parameter is ignored
4092 
4093   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4094   storage.  The stored row and column indices begin with zero.
4095   See [Sparse Matrices](sec_matsparse) for details.
4096 
4097   The parallel matrix is partitioned such that the first m0 rows belong to
4098   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4099   to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4100 
4101   The DIAGONAL portion of the local submatrix of a processor can be defined
4102   as the submatrix which is obtained by extracting the part corresponding to
4103   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4104   first row that belongs to the processor, r2 is the last row belonging to
4105   this processor, and c1-c2 is the range of indices of the local part of a
4106   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4107   common case of a square matrix, the row and column ranges are the same and
4108   the DIAGONAL part is also square. The remaining portion of the local
4109   submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4110 
4111   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4112 
4113   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4114   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
4115   You can also run with the option `-info` and look for messages with the string
4116   malloc in them to see if additional memory allocation was needed.
4117 
4118 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4119           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4120 @*/
4121 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4122 {
4123   PetscFunctionBegin;
4124   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4125   PetscValidType(B, 1);
4126   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4127   PetscFunctionReturn(PETSC_SUCCESS);
4128 }
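
/*
  Illustrative sketch (hypothetical helper, not used elsewhere in this file): computing exact
  d_nnz/o_nnz arrays for MatMPIAIJSetPreallocation() from per-row global column indices, by testing
  each column against this rank's diagonal-block column range [cstart, cend) supplied by the caller.
*/
static PetscErrorCode ExamplePreallocateFromColumns(Mat B, PetscInt mlocal, PetscInt cstart, PetscInt cend, const PetscInt ncols[], const PetscInt *const cols[])
{
  PetscInt *d_nnz, *o_nnz;

  PetscFunctionBegin;
  PetscCall(PetscCalloc2(mlocal, &d_nnz, mlocal, &o_nnz));
  for (PetscInt i = 0; i < mlocal; i++) {
    for (PetscInt j = 0; j < ncols[i]; j++) {
      if (cols[i][j] >= cstart && cols[i][j] < cend) d_nnz[i]++; /* falls in the DIAGONAL block */
      else o_nnz[i]++;                                           /* falls in the OFF-DIAGONAL block */
    }
  }
  PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
  PetscCall(PetscFree2(d_nnz, o_nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}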
4129 
4130 /*@
4131   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4132   CSR format.
4133 
4134   Collective
4135 
4136   Input Parameters:
4137 + comm - MPI communicator
4138 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4139 . n    - This value should be the same as the local size used in creating the
4140          x vector for the matrix-vector product $ y = Ax$. (or `PETSC_DECIDE` to have
4141          calculated if `N` is given) For square matrices n is almost always `m`.
4142 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4143 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4144 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4145 . j    - global column indices
4146 - a    - optional matrix values
4147 
4148   Output Parameter:
4149 . mat - the matrix
4150 
4151   Level: intermediate
4152 
4153   Notes:
4154   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4155   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4156   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4157 
4158   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4159 
4160   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`
4161 
4162   If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
4163   `MatUpdateMPIAIJWithArray()`, the column indices **must** be sorted.
4164 
4165   The format which is used for the sparse matrix input is equivalent to a
4166   row-major ordering, i.e., for the following matrix, the input data expected is
4167   as shown
4168 .vb
4169         1 0 0
4170         2 0 3     P0
4171        -------
4172         4 5 6     P1
4173 
4174      Process0 [P0] rows_owned=[0,1]
4175         i =  {0,1,3}  [size = nrow+1  = 2+1]
4176         j =  {0,0,2}  [size = 3]
4177         v =  {1,2,3}  [size = 3]
4178 
4179      Process1 [P1] rows_owned=[2]
4180         i =  {0,3}    [size = nrow+1  = 1+1]
4181         j =  {0,1,2}  [size = 3]
4182         v =  {4,5,6}  [size = 3]
4183 .ve
4184 
4185 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4186           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4187 @*/
4188 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4189 {
4190   PetscFunctionBegin;
4191   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4192   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4193   PetscCall(MatCreate(comm, mat));
4194   PetscCall(MatSetSizes(*mat, m, n, M, N));
4195   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4196   PetscCall(MatSetType(*mat, MATMPIAIJ));
4197   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4198   PetscFunctionReturn(PETSC_SUCCESS);
4199 }
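
/*
  Illustrative sketch (hypothetical wrapper, not used elsewhere in this file): the one-call
  convenience form. It is equivalent to MatCreate()/MatSetSizes()/MatSetType(MATMPIAIJ) followed by
  MatMPIAIJSetPreallocationCSR(); ia/ja/va are assumed to hold this rank's local rows in 0-based CSR.
*/
static PetscErrorCode ExampleCreateFromCSR(MPI_Comm comm, PetscInt mlocal, PetscInt N, const PetscInt ia[], const PetscInt ja[], const PetscScalar va[], Mat *A)
{
  PetscFunctionBegin;
  /* PETSC_DECIDE lets PETSc choose the local column size; the global row count is summed from mlocal */
  PetscCall(MatCreateMPIAIJWithArrays(comm, mlocal, PETSC_DECIDE, PETSC_DETERMINE, N, ia, ja, va, A));
  PetscFunctionReturn(PETSC_SUCCESS);
}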
4200 
4201 /*@
4202   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4203   CSR format. Only the numerical values are updated; the other arrays must be identical to those passed
4204   to `MatCreateMPIAIJWithArrays()`
4205 
4206   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4207 
4208   Collective
4209 
4210   Input Parameters:
4211 + mat - the matrix
4212 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4213 . n   - This value should be the same as the local size used in creating the
4214        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4215        calculated if N is given) For square matrices n is almost always m.
4216 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4217 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4218 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4219 . J   - column indices
4220 - v   - matrix values
4221 
4222   Level: deprecated
4223 
4224 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4225           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4226 @*/
4227 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4228 {
4229   PetscInt        nnz, i;
4230   PetscBool       nooffprocentries;
4231   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4232   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4233   PetscScalar    *ad, *ao;
4234   PetscInt        ldi, Iii, md;
4235   const PetscInt *Adi = Ad->i;
4236   PetscInt       *ld  = Aij->ld;
4237 
4238   PetscFunctionBegin;
4239   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4240   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4241   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4242   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4243 
4244   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4245   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4246 
4247   for (i = 0; i < m; i++) {
4248     if (PetscDefined(USE_DEBUG)) {
4249       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4250         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4251         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4252       }
4253     }
4254     nnz = Ii[i + 1] - Ii[i];
4255     Iii = Ii[i];
4256     ldi = ld[i];
4257     md  = Adi[i + 1] - Adi[i];
4258     PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4259     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4260     PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4261     ad += md;
4262     ao += nnz - md;
4263   }
4264   nooffprocentries      = mat->nooffprocentries;
4265   mat->nooffprocentries = PETSC_TRUE;
4266   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4267   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4268   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4269   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4270   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4271   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4272   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4273   mat->nooffprocentries = nooffprocentries;
4274   PetscFunctionReturn(PETSC_SUCCESS);
4275 }
4276 
4277 /*@
4278   MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4279 
4280   Collective
4281 
4282   Input Parameters:
4283 + mat - the matrix
4284 - v   - matrix values, stored by row
4285 
4286   Level: intermediate
4287 
4288   Notes:
4289   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4290 
4291   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4292 
4293 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4294           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4295 @*/
4296 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4297 {
4298   PetscInt        nnz, i, m;
4299   PetscBool       nooffprocentries;
4300   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4301   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4302   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4303   PetscScalar    *ad, *ao;
4304   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4305   PetscInt        ldi, Iii, md;
4306   PetscInt       *ld = Aij->ld;
4307 
4308   PetscFunctionBegin;
4309   m = mat->rmap->n;
4310 
4311   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4312   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4313   Iii = 0;
4314   for (i = 0; i < m; i++) {
4315     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4316     ldi = ld[i];
4317     md  = Adi[i + 1] - Adi[i];
4318     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4319     ad += md;
4320     if (ao) {
4321       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4322       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4323       ao += nnz - md;
4324     }
4325     Iii += nnz;
4326   }
4327   nooffprocentries      = mat->nooffprocentries;
4328   mat->nooffprocentries = PETSC_TRUE;
4329   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4330   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4331   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4332   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4333   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4334   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4335   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4336   mat->nooffprocentries = nooffprocentries;
4337   PetscFunctionReturn(PETSC_SUCCESS);
4338 }
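
/*
  Illustrative sketch (hypothetical, not used elsewhere in this file): refreshing the numerical
  values of a matrix built with MatCreateMPIAIJWithArrays() (with sorted column indices) inside a
  time loop. The doubling of v[] merely stands in for recomputing the coefficients each step.
*/
static PetscErrorCode ExampleTimeLoopUpdate(Mat A, PetscInt nsteps, PetscInt nnz_local, PetscScalar v[])
{
  PetscFunctionBegin;
  for (PetscInt step = 0; step < nsteps; step++) {
    for (PetscInt k = 0; k < nnz_local; k++) v[k] *= 2.0;
    /* only values change; the nonzero pattern and communication setup are reused */
    PetscCall(MatUpdateMPIAIJWithArray(A, v));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}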
4339 
4340 /*@
4341   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4342   (the default parallel PETSc format).  For good matrix assembly performance
4343   the user should preallocate the matrix storage by setting the parameters
4344   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4345 
4346   Collective
4347 
4348   Input Parameters:
4349 + comm  - MPI communicator
4350 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4351           This value should be the same as the local size used in creating the
4352           y vector for the matrix-vector product y = Ax.
4353 . n     - This value should be the same as the local size used in creating the
4354           x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4355           calculated if N is given) For square matrices n is almost always m.
4356 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4357 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4358 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4359           (same value is used for all local rows)
4360 . d_nnz - array containing the number of nonzeros in the various rows of the
4361           DIAGONAL portion of the local submatrix (possibly different for each row)
4362           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4363           The size of this array is equal to the number of local rows, i.e 'm'.
4364 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4365           submatrix (same value is used for all local rows).
4366 - o_nnz - array containing the number of nonzeros in the various rows of the
4367           OFF-DIAGONAL portion of the local submatrix (possibly different for
4368           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4369           structure. The size of this array is equal to the number
4370           of local rows, i.e 'm'.
4371 
4372   Output Parameter:
4373 . A - the matrix
4374 
4375   Options Database Keys:
4376 + -mat_no_inode                     - Do not use inodes
4377 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4378 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4379                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4380                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4381 
4382   Level: intermediate
4383 
4384   Notes:
4385   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4386   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4387   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4388 
4389   If the *_nnz parameter is given then the *_nz parameter is ignored
4390 
4391   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4392   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4393   storage requirements for this matrix.
4394 
4395   If `PETSC_DECIDE` or  `PETSC_DETERMINE` is used for a particular argument on one
4396   processor then it must be used on all processors that share the object for
4397   that argument.
4398 
4399   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4400   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4401 
4402   The user MUST specify either the local or global matrix dimensions
4403   (possibly both).
4404 
4405   The parallel matrix is partitioned across processors such that the
4406   first `m0` rows belong to process 0, the next `m1` rows belong to
4407   process 1, the next `m2` rows belong to process 2, etc., where
4408   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4409   values corresponding to an [m x N] submatrix.
4410 
4411   The columns are logically partitioned with the n0 columns belonging
4412   to the 0th partition, the next n1 columns belonging to the next
4413   partition, etc., where n0,n1,n2... are the input parameter 'n'.
4414 
4415   The DIAGONAL portion of the local submatrix on any given processor
4416   is the submatrix corresponding to the rows and columns m,n
4417   corresponding to the given processor, i.e., the diagonal matrix on
4418   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4419   etc. The remaining portion of the local submatrix [m x (N-n)]
4420   constitute the OFF-DIAGONAL portion. The example below better
4421   illustrates this concept. The two matrices, the DIAGONAL portion and
4422   the OFF-DIAGONAL portion are each stored as `MATSEQAIJ` matrices.
4423 
4424   For a square global matrix we define each processor's diagonal portion
4425   to be its local rows and the corresponding columns (a square submatrix);
4426   each processor's off-diagonal portion encompasses the remainder of the
4427   local matrix (a rectangular submatrix).
4428 
4429   If `o_nnz`, `d_nnz` are specified, then `o_nz`, and `d_nz` are ignored.
4430 
4431   When calling this routine with a single process communicator, a matrix of
4432   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4433   type of communicator, use the construction mechanism
4434 .vb
4435   MatCreate(..., &A);
4436   MatSetType(A, MATMPIAIJ);
4437   MatSetSizes(A, m, n, M, N);
4438   MatMPIAIJSetPreallocation(A, ...);
4439 .ve
4440 
4441   By default, this format uses inodes (identical nodes) when possible.
4442   We search for consecutive rows with the same nonzero structure, thereby
4443   reusing matrix information to achieve increased efficiency.
4444 
4445   Example Usage:
4446   Consider the following 8x8 matrix with 34 non-zero values, that is
4447   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4448   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4449   as follows
4450 
4451 .vb
4452             1  2  0  |  0  3  0  |  0  4
4453     Proc0   0  5  6  |  7  0  0  |  8  0
4454             9  0 10  | 11  0  0  | 12  0
4455     -------------------------------------
4456            13  0 14  | 15 16 17  |  0  0
4457     Proc1   0 18  0  | 19 20 21  |  0  0
4458             0  0  0  | 22 23  0  | 24  0
4459     -------------------------------------
4460     Proc2  25 26 27  |  0  0 28  | 29  0
4461            30  0  0  | 31 32 33  |  0 34
4462 .ve
4463 
4464   This can be represented as a collection of submatrices as
4465 
4466 .vb
4467       A B C
4468       D E F
4469       G H I
4470 .ve
4471 
4472   Where the submatrices A,B,C are owned by proc0, D,E,F are
4473   owned by proc1, G,H,I are owned by proc2.
4474 
4475   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4476   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4477   The 'M','N' parameters are 8,8, and have the same values on all procs.
4478 
4479   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4480   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4481   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4482   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4483   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4484   matrix, and [DF] as another `MATSEQAIJ` matrix.
4485 
4486   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4487   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4488   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4489   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4490   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4491   In this case, the values of `d_nz`,`o_nz` are
4492 .vb
4493      proc0  dnz = 2, o_nz = 2
4494      proc1  dnz = 3, o_nz = 2
4495      proc2  dnz = 1, o_nz = 4
4496 .ve
4497   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4498   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4499   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4500   34 values.
4501 
4502   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4503   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4504   In the above case the values for `d_nnz`, `o_nnz` are
4505 .vb
4506      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4507      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4508      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4509 .ve
4510   Here the space allocated is the sum of all the above values, i.e., 34, and
4511   hence pre-allocation is perfect.
4512 
4513 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4514           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4515           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4516 @*/
4517 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4518 {
4519   PetscMPIInt size;
4520 
4521   PetscFunctionBegin;
4522   PetscCall(MatCreate(comm, A));
4523   PetscCall(MatSetSizes(*A, m, n, M, N));
4524   PetscCallMPI(MPI_Comm_size(comm, &size));
4525   if (size > 1) {
4526     PetscCall(MatSetType(*A, MATMPIAIJ));
4527     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4528   } else {
4529     PetscCall(MatSetType(*A, MATSEQAIJ));
4530     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4531   }
4532   PetscFunctionReturn(PETSC_SUCCESS);
4533 }
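
/*
  Illustrative sketch (hypothetical, not used elsewhere in this file): a distributed tridiagonal
  matrix created with MatCreateAIJ(). Each row has at most 3 nonzeros; a rank that owns a single row
  may see both neighbours land in the OFF-DIAGONAL block, so d_nz = 3, o_nz = 2 is a safe constant
  preallocation here.
*/
static PetscErrorCode ExampleCreateTridiagonal(MPI_Comm comm, PetscInt N, Mat *A)
{
  PetscInt rstart, rend;

  PetscFunctionBegin;
  PetscCall(MatCreateAIJ(comm, PETSC_DECIDE, PETSC_DECIDE, N, N, 3, NULL, 2, NULL, A));
  PetscCall(MatGetOwnershipRange(*A, &rstart, &rend));
  for (PetscInt row = rstart; row < rend; row++) {
    const PetscScalar vals[] = {-1.0, 2.0, -1.0};
    PetscInt          cols[] = {row - 1, row, row + 1};
    PetscInt          ncols = 3, off = 0;

    if (row == 0) { off = 1; ncols--; } /* first global row has no left neighbour */
    if (row == N - 1) ncols--;          /* last global row has no right neighbour */
    PetscCall(MatSetValues(*A, 1, &row, ncols, cols + off, vals + off, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(*A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*A, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}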
4534 
4535 /*@C
4536   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4537 
4538   Not Collective
4539 
4540   Input Parameter:
4541 . A - The `MATMPIAIJ` matrix
4542 
4543   Output Parameters:
4544 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4545 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4546 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4547 
4548   Level: intermediate
4549 
4550   Note:
4551   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
4552   in `Ad` are in [0, Nc) where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4553   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4554   local column numbers to global column numbers in the original matrix.
4555 
4556 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4557 @*/
4558 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4559 {
4560   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4561   PetscBool   flg;
4562 
4563   PetscFunctionBegin;
4564   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4565   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4566   if (Ad) *Ad = a->A;
4567   if (Ao) *Ao = a->B;
4568   if (colmap) *colmap = a->garray;
4569   PetscFunctionReturn(PETSC_SUCCESS);
4570 }
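
/*
  Illustrative sketch (hypothetical, not used elsewhere in this file): walking the OFF-DIAGONAL block
  returned by MatMPIAIJGetSeqAIJ() and translating its compact local columns back to global columns
  of the parallel matrix through colmap.
*/
static PetscErrorCode ExamplePrintOffDiagonal(Mat A)
{
  Mat             Ao;
  const PetscInt *colmap;
  PetscInt        mloc, rstart;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, NULL, &Ao, &colmap));
  PetscCall(MatGetLocalSize(A, &mloc, NULL));
  PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
  for (PetscInt i = 0; i < mloc; i++) {
    PetscInt           ncols;
    const PetscInt    *cols;
    const PetscScalar *vals;

    PetscCall(MatGetRow(Ao, i, &ncols, &cols, &vals));
    /* cols[] are local columns of Ao; colmap[cols[k]] is the corresponding global column of A */
    for (PetscInt k = 0; k < ncols; k++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "row %" PetscInt_FMT " col %" PetscInt_FMT " val %g\n", rstart + i, colmap[cols[k]], (double)PetscRealPart(vals[k])));
    PetscCall(MatRestoreRow(Ao, i, &ncols, &cols, &vals));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}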
4571 
4572 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4573 {
4574   PetscInt     m, N, i, rstart, nnz, Ii;
4575   PetscInt    *indx;
4576   PetscScalar *values;
4577   MatType      rootType;
4578 
4579   PetscFunctionBegin;
4580   PetscCall(MatGetSize(inmat, &m, &N));
4581   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4582     PetscInt *dnz, *onz, sum, bs, cbs;
4583 
4584     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4585     /* Check sum(n) = N */
4586     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4587     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4588 
4589     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4590     rstart -= m;
4591 
4592     MatPreallocateBegin(comm, m, n, dnz, onz);
4593     for (i = 0; i < m; i++) {
4594       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4595       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4596       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4597     }
4598 
4599     PetscCall(MatCreate(comm, outmat));
4600     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4601     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4602     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4603     PetscCall(MatGetRootType_Private(inmat, &rootType));
4604     PetscCall(MatSetType(*outmat, rootType));
4605     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4606     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4607     MatPreallocateEnd(dnz, onz);
4608     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4609   }
4610 
4611   /* numeric phase */
4612   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4613   for (i = 0; i < m; i++) {
4614     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4615     Ii = i + rstart;
4616     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4617     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4618   }
4619   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4620   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4621   PetscFunctionReturn(PETSC_SUCCESS);
4622 }
4623 
4624 static PetscErrorCode MatMergeSeqsToMPIDestroy(void **data)
4625 {
4626   MatMergeSeqsToMPI *merge = (MatMergeSeqsToMPI *)*data;
4627 
4628   PetscFunctionBegin;
4629   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4630   PetscCall(PetscFree(merge->id_r));
4631   PetscCall(PetscFree(merge->len_s));
4632   PetscCall(PetscFree(merge->len_r));
4633   PetscCall(PetscFree(merge->bi));
4634   PetscCall(PetscFree(merge->bj));
4635   PetscCall(PetscFree(merge->buf_ri[0]));
4636   PetscCall(PetscFree(merge->buf_ri));
4637   PetscCall(PetscFree(merge->buf_rj[0]));
4638   PetscCall(PetscFree(merge->buf_rj));
4639   PetscCall(PetscFree(merge->coi));
4640   PetscCall(PetscFree(merge->coj));
4641   PetscCall(PetscFree(merge->owners_co));
4642   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4643   PetscCall(PetscFree(merge));
4644   PetscFunctionReturn(PETSC_SUCCESS);
4645 }
4646 
4647 #include <../src/mat/utils/freespace.h>
4648 #include <petscbt.h>
4649 
4650 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4651 {
4652   MPI_Comm           comm;
4653   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)seqmat->data;
4654   PetscMPIInt        size, rank, taga, *len_s;
4655   PetscInt           N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4656   PetscMPIInt        proc, k;
4657   PetscInt         **buf_ri, **buf_rj;
4658   PetscInt           anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4659   PetscInt           nrows, **buf_ri_k, **nextrow, **nextai;
4660   MPI_Request       *s_waits, *r_waits;
4661   MPI_Status        *status;
4662   const MatScalar   *aa, *a_a;
4663   MatScalar        **abuf_r, *ba_i;
4664   MatMergeSeqsToMPI *merge;
4665   PetscContainer     container;
4666 
4667   PetscFunctionBegin;
4668   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4669   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4670 
4671   PetscCallMPI(MPI_Comm_size(comm, &size));
4672   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4673 
4674   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4675   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4676   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4677   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4678   aa = a_a;
4679 
4680   bi     = merge->bi;
4681   bj     = merge->bj;
4682   buf_ri = merge->buf_ri;
4683   buf_rj = merge->buf_rj;
4684 
4685   PetscCall(PetscMalloc1(size, &status));
4686   owners = merge->rowmap->range;
4687   len_s  = merge->len_s;
4688 
4689   /* send and recv matrix values */
4690   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4691   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4692 
4693   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4694   for (proc = 0, k = 0; proc < size; proc++) {
4695     if (!len_s[proc]) continue;
4696     i = owners[proc];
4697     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4698     k++;
4699   }
4700 
4701   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4702   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4703   PetscCall(PetscFree(status));
4704 
4705   PetscCall(PetscFree(s_waits));
4706   PetscCall(PetscFree(r_waits));
4707 
4708   /* insert mat values of mpimat */
4709   PetscCall(PetscMalloc1(N, &ba_i));
4710   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4711 
4712   for (k = 0; k < merge->nrecv; k++) {
4713     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4714     nrows       = *buf_ri_k[k];
4715     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4716     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4717   }
4718 
4719   /* set values of ba */
4720   m = merge->rowmap->n;
4721   for (i = 0; i < m; i++) {
4722     arow = owners[rank] + i;
4723     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4724     bnzi = bi[i + 1] - bi[i];
4725     PetscCall(PetscArrayzero(ba_i, bnzi));
4726 
4727     /* add local non-zero vals of this proc's seqmat into ba */
4728     anzi   = ai[arow + 1] - ai[arow];
4729     aj     = a->j + ai[arow];
4730     aa     = a_a + ai[arow];
4731     nextaj = 0;
4732     for (j = 0; nextaj < anzi; j++) {
4733       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4734         ba_i[j] += aa[nextaj++];
4735       }
4736     }
4737 
4738     /* add received vals into ba */
4739     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4740       /* i-th row */
4741       if (i == *nextrow[k]) {
4742         anzi   = *(nextai[k] + 1) - *nextai[k];
4743         aj     = buf_rj[k] + *nextai[k];
4744         aa     = abuf_r[k] + *nextai[k];
4745         nextaj = 0;
4746         for (j = 0; nextaj < anzi; j++) {
4747           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4748             ba_i[j] += aa[nextaj++];
4749           }
4750         }
4751         nextrow[k]++;
4752         nextai[k]++;
4753       }
4754     }
4755     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4756   }
4757   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4758   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4759   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4760 
4761   PetscCall(PetscFree(abuf_r[0]));
4762   PetscCall(PetscFree(abuf_r));
4763   PetscCall(PetscFree(ba_i));
4764   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4765   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4766   PetscFunctionReturn(PETSC_SUCCESS);
4767 }
4768 
4769 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4770 {
4771   Mat                B_mpi;
4772   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)seqmat->data;
4773   PetscMPIInt        size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4774   PetscInt         **buf_rj, **buf_ri, **buf_ri_k;
4775   PetscInt           M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4776   PetscInt           len, *dnz, *onz, bs, cbs;
4777   PetscInt           k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4778   PetscInt           nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4779   MPI_Request       *si_waits, *sj_waits, *ri_waits, *rj_waits;
4780   MPI_Status        *status;
4781   PetscFreeSpaceList free_space = NULL, current_space = NULL;
4782   PetscBT            lnkbt;
4783   MatMergeSeqsToMPI *merge;
4784   PetscContainer     container;
4785 
4786   PetscFunctionBegin;
4787   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4788 
4789   /* make sure it is a PETSc comm */
4790   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4791   PetscCallMPI(MPI_Comm_size(comm, &size));
4792   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4793 
4794   PetscCall(PetscNew(&merge));
4795   PetscCall(PetscMalloc1(size, &status));
4796 
4797   /* determine row ownership */
4798   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4799   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4800   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4801   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4802   PetscCall(PetscLayoutSetUp(merge->rowmap));
4803   PetscCall(PetscMalloc1(size, &len_si));
4804   PetscCall(PetscMalloc1(size, &merge->len_s));
4805 
4806   m      = merge->rowmap->n;
4807   owners = merge->rowmap->range;
4808 
4809   /* determine the number of messages to send, their lengths */
4810   len_s = merge->len_s;
4811 
4812   len          = 0; /* length of buf_si[] */
4813   merge->nsend = 0;
4814   for (PetscMPIInt proc = 0; proc < size; proc++) {
4815     len_si[proc] = 0;
4816     if (proc == rank) {
4817       len_s[proc] = 0;
4818     } else {
4819       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4820       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4821     }
4822     if (len_s[proc]) {
4823       merge->nsend++;
4824       nrows = 0;
4825       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4826         if (ai[i + 1] > ai[i]) nrows++;
4827       }
4828       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4829       len += len_si[proc];
4830     }
4831   }
4832 
4833   /* determine the number and length of messages to receive for ij-structure */
4834   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4835   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4836 
4837   /* post the Irecv of j-structure */
4838   PetscCall(PetscCommGetNewTag(comm, &tagj));
4839   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4840 
4841   /* post the Isend of j-structure */
4842   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4843 
4844   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4845     if (!len_s[proc]) continue;
4846     i = owners[proc];
4847     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4848     k++;
4849   }
4850 
4851   /* receives and sends of j-structure are complete */
4852   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4853   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4854 
4855   /* send and recv i-structure */
4856   PetscCall(PetscCommGetNewTag(comm, &tagi));
4857   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4858 
4859   PetscCall(PetscMalloc1(len + 1, &buf_s));
4860   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4861   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4862     if (!len_s[proc]) continue;
4863     /* form outgoing message for i-structure:
4864          buf_si[0]:                 nrows to be sent
4865                [1:nrows]:           row index (local to the receiving process)
4866                [nrows+1:2*nrows+1]: i-structure index
4867     */
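    /* Illustrative example (hypothetical numbers): if the receiver's rows at local indices 1 and 3 are
       the only nonempty ones, with 2 and 4 entries respectively, the message is {2, 1, 3, 0, 2, 6}. */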
4868     nrows       = len_si[proc] / 2 - 1;
4869     buf_si_i    = buf_si + nrows + 1;
4870     buf_si[0]   = nrows;
4871     buf_si_i[0] = 0;
4872     nrows       = 0;
4873     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4874       anzi = ai[i + 1] - ai[i];
4875       if (anzi) {
4876         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4877         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4878         nrows++;
4879       }
4880     }
4881     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4882     k++;
4883     buf_si += len_si[proc];
4884   }
4885 
4886   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4887   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4888 
4889   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4890   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4891 
4892   PetscCall(PetscFree(len_si));
4893   PetscCall(PetscFree(len_ri));
4894   PetscCall(PetscFree(rj_waits));
4895   PetscCall(PetscFree2(si_waits, sj_waits));
4896   PetscCall(PetscFree(ri_waits));
4897   PetscCall(PetscFree(buf_s));
4898   PetscCall(PetscFree(status));
4899 
4900   /* compute a local seq matrix in each processor */
4901   /* allocate bi array and free space for accumulating nonzero column info */
4902   PetscCall(PetscMalloc1(m + 1, &bi));
4903   bi[0] = 0;
4904 
4905   /* create and initialize a linked list */
4906   nlnk = N + 1;
4907   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4908 
4909   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4910   len = ai[owners[rank + 1]] - ai[owners[rank]];
4911   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4912 
4913   current_space = free_space;
4914 
4915   /* determine symbolic info for each local row */
4916   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4917 
4918   for (k = 0; k < merge->nrecv; k++) {
4919     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4920     nrows       = *buf_ri_k[k];
4921     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4922     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4923   }
4924 
4925   MatPreallocateBegin(comm, m, n, dnz, onz);
4926   len = 0;
4927   for (i = 0; i < m; i++) {
4928     bnzi = 0;
4929     /* add local non-zero cols of this proc's seqmat into lnk */
4930     arow = owners[rank] + i;
4931     anzi = ai[arow + 1] - ai[arow];
4932     aj   = a->j + ai[arow];
4933     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4934     bnzi += nlnk;
4935     /* add received col data into lnk */
4936     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4937       if (i == *nextrow[k]) {            /* i-th row */
4938         anzi = *(nextai[k] + 1) - *nextai[k];
4939         aj   = buf_rj[k] + *nextai[k];
4940         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4941         bnzi += nlnk;
4942         nextrow[k]++;
4943         nextai[k]++;
4944       }
4945     }
4946     if (len < bnzi) len = bnzi; /* =max(bnzi) */
4947 
4948     /* if free space is not available, make more free space */
4949     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
4950     /* copy data into free space, then initialize lnk */
4951     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
4952     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
4953 
4954     current_space->array += bnzi;
4955     current_space->local_used += bnzi;
4956     current_space->local_remaining -= bnzi;
4957 
4958     bi[i + 1] = bi[i] + bnzi;
4959   }
4960 
4961   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4962 
4963   PetscCall(PetscMalloc1(bi[m], &bj));
4964   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
4965   PetscCall(PetscLLDestroy(lnk, lnkbt));
4966 
4967   /* create symbolic parallel matrix B_mpi */
4968   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
4969   PetscCall(MatCreate(comm, &B_mpi));
4970   if (n == PETSC_DECIDE) {
4971     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
4972   } else {
4973     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4974   }
4975   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
4976   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
4977   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
4978   MatPreallocateEnd(dnz, onz);
4979   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
4980 
4981   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
4982   B_mpi->assembled = PETSC_FALSE;
4983   merge->bi        = bi;
4984   merge->bj        = bj;
4985   merge->buf_ri    = buf_ri;
4986   merge->buf_rj    = buf_rj;
4987   merge->coi       = NULL;
4988   merge->coj       = NULL;
4989   merge->owners_co = NULL;
4990 
4991   PetscCall(PetscCommDestroy(&comm));
4992 
4993   /* attach the supporting struct to B_mpi for reuse */
4994   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
4995   PetscCall(PetscContainerSetPointer(container, merge));
4996   PetscCall(PetscContainerSetCtxDestroy(container, MatMergeSeqsToMPIDestroy));
4997   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
4998   PetscCall(PetscContainerDestroy(&container));
4999   *mpimat = B_mpi;
5000 
5001   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5002   PetscFunctionReturn(PETSC_SUCCESS);
5003 }
5004 
5005 /*@
5006   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5007   matrices from each processor
5008 
5009   Collective
5010 
5011   Input Parameters:
5012 + comm   - the communicator the parallel matrix will live on
5013 . seqmat - the input sequential matrix, one per MPI process
5014 . m      - number of local rows (or `PETSC_DECIDE`)
5015 . n      - number of local columns (or `PETSC_DECIDE`)
5016 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5017 
5018   Output Parameter:
5019 . mpimat - the parallel matrix generated
5020 
5021   Level: advanced
5022 
5023   Note:
5024   The dimensions of the sequential matrix in each processor MUST be the same.
5025   The input `seqmat` is included in the container composed on `mpimat` under the name "MatMergeSeqsToMPI", and will be
5026   destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5027 
5028 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5029 @*/
5030 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5031 {
5032   PetscMPIInt size;
5033 
5034   PetscFunctionBegin;
5035   PetscCallMPI(MPI_Comm_size(comm, &size));
5036   if (size == 1) {
5037     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5038     if (scall == MAT_INITIAL_MATRIX) {
5039       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5040     } else {
5041       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5042     }
5043     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5044     PetscFunctionReturn(PETSC_SUCCESS);
5045   }
5046   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5047   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5048   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5049   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5050   PetscFunctionReturn(PETSC_SUCCESS);
5051 }
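/*
  A minimal usage sketch of MatCreateMPIAIJSumSeqAIJ() (illustrative only, not from the PETSc test
  suite): every rank builds a SeqAIJ matrix `local` with the same global dimensions, and the
  rank-wise contributions are summed into one MATMPIAIJ matrix `global`. The names `local`,
  `global`, `M`, `N` and `nnz` are placeholders.

    Mat local, global;

    PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, M, N, 0, nnz, &local));
    // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() on local ...
    PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, local, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &global));
    // after changing only the numerical values of local (same nonzero pattern):
    PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, local, PETSC_DECIDE, PETSC_DECIDE, MAT_REUSE_MATRIX, &global));
    PetscCall(MatDestroy(&global));
*/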
5052 
5053 /*@
5054   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5055 
5056   Not Collective
5057 
5058   Input Parameter:
5059 . A - the matrix
5060 
5061   Output Parameter:
5062 . A_loc - the local sequential matrix generated
5063 
5064   Level: developer
5065 
5066   Notes:
5067   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5068   with `mlocal` rows and `n` columns, where `mlocal` is the local row count obtained with
5069   `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5070 
5071   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5072 
5073   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5074 
5075   Destroy the matrix with `MatDestroy()`
5076 
5077 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5078 @*/
5079 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5080 {
5081   PetscBool mpi;
5082 
5083   PetscFunctionBegin;
5084   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5085   if (mpi) {
5086     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5087   } else {
5088     *A_loc = A;
5089     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5090   }
5091   PetscFunctionReturn(PETSC_SUCCESS);
5092 }
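/*
  A brief call-pattern sketch for MatAIJGetLocalMat() (illustrative): `A` is an assembled AIJ
  matrix; the returned matrix is always released with MatDestroy(), which either drops the extra
  reference (sequential input) or frees the newly built local matrix (parallel input).

    Mat A_loc;

    PetscCall(MatAIJGetLocalMat(A, &A_loc));
    // ... work with the sequential matrix A_loc ...
    PetscCall(MatDestroy(&A_loc));
*/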
5093 
5094 /*@
5095   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5096 
5097   Not Collective
5098 
5099   Input Parameters:
5100 + A     - the matrix
5101 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5102 
5103   Output Parameter:
5104 . A_loc - the local sequential matrix generated
5105 
5106   Level: developer
5107 
5108   Notes:
5109   The matrix is created by taking all of `A`'s local rows and putting them into a sequential
5110   matrix with `mlocal` rows and `n` columns. `mlocal` is the local row count obtained with
5111   `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5112 
5113   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5114 
5115   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5116   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5117   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5118   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5119 
5120 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5121 @*/
5122 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5123 {
5124   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5125   Mat_SeqAIJ        *mat, *a, *b;
5126   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5127   const PetscScalar *aa, *ba, *aav, *bav;
5128   PetscScalar       *ca, *cam;
5129   PetscMPIInt        size;
5130   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5131   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5132   PetscBool          match;
5133 
5134   PetscFunctionBegin;
5135   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5136   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5137   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5138   if (size == 1) {
5139     if (scall == MAT_INITIAL_MATRIX) {
5140       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5141       *A_loc = mpimat->A;
5142     } else if (scall == MAT_REUSE_MATRIX) {
5143       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5144     }
5145     PetscFunctionReturn(PETSC_SUCCESS);
5146   }
5147 
5148   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5149   a  = (Mat_SeqAIJ *)mpimat->A->data;
5150   b  = (Mat_SeqAIJ *)mpimat->B->data;
5151   ai = a->i;
5152   aj = a->j;
5153   bi = b->i;
5154   bj = b->j;
5155   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5156   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5157   aa = aav;
5158   ba = bav;
5159   if (scall == MAT_INITIAL_MATRIX) {
5160     PetscCall(PetscMalloc1(1 + am, &ci));
5161     ci[0] = 0;
5162     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5163     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5164     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5165     k = 0;
5166     for (i = 0; i < am; i++) {
5167       ncols_o = bi[i + 1] - bi[i];
5168       ncols_d = ai[i + 1] - ai[i];
5169       /* off-diagonal portion of A */
5170       for (jo = 0; jo < ncols_o; jo++) {
5171         col = cmap[*bj];
5172         if (col >= cstart) break;
5173         cj[k] = col;
5174         bj++;
5175         ca[k++] = *ba++;
5176       }
5177       /* diagonal portion of A */
5178       for (j = 0; j < ncols_d; j++) {
5179         cj[k]   = cstart + *aj++;
5180         ca[k++] = *aa++;
5181       }
5182       /* off-diagonal portion of A */
5183       for (j = jo; j < ncols_o; j++) {
5184         cj[k]   = cmap[*bj++];
5185         ca[k++] = *ba++;
5186       }
5187     }
5188     /* put together the new matrix */
5189     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5190     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5191     /* Since these are PETSc arrays, change flags to free them as necessary. */
5192     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5193     mat->free_a  = PETSC_TRUE;
5194     mat->free_ij = PETSC_TRUE;
5195     mat->nonew   = 0;
5196   } else if (scall == MAT_REUSE_MATRIX) {
5197     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5198     ci  = mat->i;
5199     cj  = mat->j;
5200     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5201     for (i = 0; i < am; i++) {
5202       /* off-diagonal portion of A */
5203       ncols_o = bi[i + 1] - bi[i];
5204       for (jo = 0; jo < ncols_o; jo++) {
5205         col = cmap[*bj];
5206         if (col >= cstart) break;
5207         *cam++ = *ba++;
5208         bj++;
5209       }
5210       /* diagonal portion of A */
5211       ncols_d = ai[i + 1] - ai[i];
5212       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
5213       /* off-diagonal portion of A */
5214       for (j = jo; j < ncols_o; j++) {
5215         *cam++ = *ba++;
5216         bj++;
5217       }
5218     }
5219     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5220   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5221   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5222   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5223   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5224   PetscFunctionReturn(PETSC_SUCCESS);
5225 }
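/*
  A sketch of the initial/reuse call pattern described in the Notes above (names illustrative):

    Mat A_loc;

    PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc)); // build the merged local rows of A
    // ... the values of A change, nonzero pattern stays the same ...
    PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));   // refresh the values of A_loc in place
    PetscCall(MatDestroy(&A_loc));
*/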
5226 
5227 /*@
5228   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5229   `mlocal` rows and `n` columns, where `n` is the sum of the number of columns of the diagonal and off-diagonal parts
5230 
5231   Not Collective
5232 
5233   Input Parameters:
5234 + A     - the matrix
5235 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5236 
5237   Output Parameters:
5238 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5239 - A_loc - the local sequential matrix generated
5240 
5241   Level: developer
5242 
5243   Note:
5244   This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returned matrix are those associated with the diagonal
5245   part, followed by those associated with the off-diagonal part (in its local ordering)
5246 
5247 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5248 @*/
5249 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5250 {
5251   Mat             Ao, Ad;
5252   const PetscInt *cmap;
5253   PetscMPIInt     size;
5254   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5255 
5256   PetscFunctionBegin;
5257   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5258   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5259   if (size == 1) {
5260     if (scall == MAT_INITIAL_MATRIX) {
5261       PetscCall(PetscObjectReference((PetscObject)Ad));
5262       *A_loc = Ad;
5263     } else if (scall == MAT_REUSE_MATRIX) {
5264       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5265     }
5266     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5267     PetscFunctionReturn(PETSC_SUCCESS);
5268   }
5269   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5270   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5271   if (f) {
5272     PetscCall((*f)(A, scall, glob, A_loc));
5273   } else {
5274     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5275     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5276     Mat_SeqAIJ        *c;
5277     PetscInt          *ai = a->i, *aj = a->j;
5278     PetscInt          *bi = b->i, *bj = b->j;
5279     PetscInt          *ci, *cj;
5280     const PetscScalar *aa, *ba;
5281     PetscScalar       *ca;
5282     PetscInt           i, j, am, dn, on;
5283 
5284     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5285     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5286     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5287     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5288     if (scall == MAT_INITIAL_MATRIX) {
5289       PetscInt k;
5290       PetscCall(PetscMalloc1(1 + am, &ci));
5291       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5292       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5293       ci[0] = 0;
5294       for (i = 0, k = 0; i < am; i++) {
5295         const PetscInt ncols_o = bi[i + 1] - bi[i];
5296         const PetscInt ncols_d = ai[i + 1] - ai[i];
5297         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5298         /* diagonal portion of A */
5299         for (j = 0; j < ncols_d; j++, k++) {
5300           cj[k] = *aj++;
5301           ca[k] = *aa++;
5302         }
5303         /* off-diagonal portion of A */
5304         for (j = 0; j < ncols_o; j++, k++) {
5305           cj[k] = dn + *bj++;
5306           ca[k] = *ba++;
5307         }
5308       }
5309       /* put together the new matrix */
5310       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5311       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5312       /* Since these are PETSc arrays, change flags to free them as necessary. */
5313       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5314       c->free_a  = PETSC_TRUE;
5315       c->free_ij = PETSC_TRUE;
5316       c->nonew   = 0;
5317       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5318     } else if (scall == MAT_REUSE_MATRIX) {
5319       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5320       for (i = 0; i < am; i++) {
5321         const PetscInt ncols_d = ai[i + 1] - ai[i];
5322         const PetscInt ncols_o = bi[i + 1] - bi[i];
5323         /* diagonal portion of A */
5324         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5325         /* off-diagonal portion of A */
5326         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5327       }
5328       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5329     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5330     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5331     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba)); /* restore the off-diagonal array obtained above */
5332     if (glob) {
5333       PetscInt cst, *gidx;
5334 
5335       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5336       PetscCall(PetscMalloc1(dn + on, &gidx));
5337       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5338       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5339       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5340     }
5341   }
5342   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5343   PetscFunctionReturn(PETSC_SUCCESS);
5344 }
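/*
  A sketch showing how the optional IS maps the local column numbering of the merged matrix back
  to global column indices (names illustrative):

    Mat             A_loc;
    IS              glob;
    const PetscInt *gcols;

    PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
    PetscCall(ISGetIndices(glob, &gcols)); // gcols[c] is the global column of local column c of A_loc
    // ... use A_loc together with gcols ...
    PetscCall(ISRestoreIndices(glob, &gcols));
    PetscCall(ISDestroy(&glob));
    PetscCall(MatDestroy(&A_loc));
*/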
5345 
5346 /*@C
5347   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from a `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5348 
5349   Not Collective
5350 
5351   Input Parameters:
5352 + A     - the matrix
5353 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5354 . row   - index set of rows to extract (or `NULL`)
5355 - col   - index set of columns to extract (or `NULL`)
5356 
5357   Output Parameter:
5358 . A_loc - the local sequential matrix generated
5359 
5360   Level: developer
5361 
5362 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5363 @*/
5364 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5365 {
5366   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5367   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5368   IS          isrowa, iscola;
5369   Mat        *aloc;
5370   PetscBool   match;
5371 
5372   PetscFunctionBegin;
5373   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5374   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5375   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5376   if (!row) {
5377     start = A->rmap->rstart;
5378     end   = A->rmap->rend;
5379     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5380   } else {
5381     isrowa = *row;
5382   }
5383   if (!col) {
5384     start = A->cmap->rstart;
5385     cmap  = a->garray;
5386     nzA   = a->A->cmap->n;
5387     nzB   = a->B->cmap->n;
5388     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5389     ncols = 0;
5390     for (i = 0; i < nzB; i++) {
5391       if (cmap[i] < start) idx[ncols++] = cmap[i];
5392       else break;
5393     }
5394     imark = i;
5395     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5396     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5397     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5398   } else {
5399     iscola = *col;
5400   }
5401   if (scall != MAT_INITIAL_MATRIX) {
5402     PetscCall(PetscMalloc1(1, &aloc));
5403     aloc[0] = *A_loc;
5404   }
5405   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5406   if (!col) { /* attach global id of condensed columns */
5407     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5408   }
5409   *A_loc = aloc[0];
5410   PetscCall(PetscFree(aloc));
5411   if (!row) PetscCall(ISDestroy(&isrowa));
5412   if (!col) PetscCall(ISDestroy(&iscola));
5413   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5414   PetscFunctionReturn(PETSC_SUCCESS);
5415 }
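/*
  A sketch of the default call pattern for MatMPIAIJGetLocalMatCondensed() (illustrative): passing
  NULL for `row` and `col` selects all local rows and the nonzero columns of the local part.

    Mat A_loc;

    PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &A_loc));
    // ... later, after the values of A change but not its pattern ...
    PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_REUSE_MATRIX, NULL, NULL, &A_loc));
    PetscCall(MatDestroy(&A_loc));
*/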
5416 
5417 /*
5418  * Create a sequential AIJ matrix based on row indices: an entire row is extracted once its index is matched.
5419  * Rows may be local or remote. The routine is designed to be memory scalable, so nothing depends
5420  * on a global size.
5421  * */
5422 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5423 {
5424   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5425   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5426   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5427   PetscMPIInt            owner;
5428   PetscSFNode           *iremote, *oiremote;
5429   const PetscInt        *lrowindices;
5430   PetscSF                sf, osf;
5431   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5432   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5433   MPI_Comm               comm;
5434   ISLocalToGlobalMapping mapping;
5435   const PetscScalar     *pd_a, *po_a;
5436 
5437   PetscFunctionBegin;
5438   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5439   /* plocalsize is the number of roots
5440    * nrows is the number of leaves
5441    * */
5442   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5443   PetscCall(ISGetLocalSize(rows, &nrows));
5444   PetscCall(PetscCalloc1(nrows, &iremote));
5445   PetscCall(ISGetIndices(rows, &lrowindices));
5446   for (i = 0; i < nrows; i++) {
5447     /* Find a remote index and an owner for a row
5448      * The row could be local or remote
5449      * */
5450     owner = 0;
5451     lidx  = 0;
5452     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5453     iremote[i].index = lidx;
5454     iremote[i].rank  = owner;
5455   }
5456   /* Create SF to communicate how many nonzero columns for each row */
5457   PetscCall(PetscSFCreate(comm, &sf));
5458   /* SF will figure out the number of nonzero columns for each row, and their
5459    * offsets
5460    * */
5461   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5462   PetscCall(PetscSFSetFromOptions(sf));
5463   PetscCall(PetscSFSetUp(sf));
5464 
5465   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5466   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5467   PetscCall(PetscCalloc1(nrows, &pnnz));
5468   roffsets[0] = 0;
5469   roffsets[1] = 0;
5470   for (i = 0; i < plocalsize; i++) {
5471     /* diagonal */
5472     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5473     /* off-diagonal */
5474     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
5475     /* compute offsets so that we know the relative location of each row */
5476     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5477     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5478   }
5479   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5480   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5481   /* 'r' means root, and 'l' means leaf */
5482   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5483   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5484   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5485   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5486   PetscCall(PetscSFDestroy(&sf));
5487   PetscCall(PetscFree(roffsets));
5488   PetscCall(PetscFree(nrcols));
5489   dntotalcols = 0;
5490   ontotalcols = 0;
5491   ncol        = 0;
5492   for (i = 0; i < nrows; i++) {
5493     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5494     ncol    = PetscMax(pnnz[i], ncol);
5495     /* diagonal */
5496     dntotalcols += nlcols[i * 2 + 0];
5497     /* off-diagonal */
5498     ontotalcols += nlcols[i * 2 + 1];
5499   }
5500   /* We do not need to figure out the exact number of columns
5501    * since all the calculations will be done by going through the raw data
5502    * */
5503   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5504   PetscCall(MatSetUp(*P_oth));
5505   PetscCall(PetscFree(pnnz));
5506   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5507   /* diagonal */
5508   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5509   /* off-diagonal */
5510   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5511   /* diagonal */
5512   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5513   /* off-diagonal */
5514   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5515   dntotalcols = 0;
5516   ontotalcols = 0;
5517   ntotalcols  = 0;
5518   for (i = 0; i < nrows; i++) {
5519     owner = 0;
5520     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5521     /* Set iremote for diag matrix */
5522     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5523       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5524       iremote[dntotalcols].rank  = owner;
5525       /* P_oth is SeqAIJ, so ilocal needs to point to the first part of the memory */
5526       ilocal[dntotalcols++] = ntotalcols++;
5527     }
5528     /* off-diagonal */
5529     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5530       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5531       oiremote[ontotalcols].rank  = owner;
5532       oilocal[ontotalcols++]      = ntotalcols++;
5533     }
5534   }
5535   PetscCall(ISRestoreIndices(rows, &lrowindices));
5536   PetscCall(PetscFree(loffsets));
5537   PetscCall(PetscFree(nlcols));
5538   PetscCall(PetscSFCreate(comm, &sf));
5539   /* P serves as the roots and P_oth as the leaves
5540    * Diag matrix
5541    * */
5542   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5543   PetscCall(PetscSFSetFromOptions(sf));
5544   PetscCall(PetscSFSetUp(sf));
5545 
5546   PetscCall(PetscSFCreate(comm, &osf));
5547   /* off-diagonal */
5548   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5549   PetscCall(PetscSFSetFromOptions(osf));
5550   PetscCall(PetscSFSetUp(osf));
5551   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5552   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5553   /* operate on the matrix internal data to save memory */
5554   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5555   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5556   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5557   /* Convert to global indices for diag matrix */
5558   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5559   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5560   /* We want P_oth to store global indices */
5561   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5562   /* Use memory scalable approach */
5563   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5564   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5565   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5566   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5567   /* Convert back to local indices */
5568   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5569   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5570   nout = 0;
5571   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5572   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal to nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
5573   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5574   /* Exchange values */
5575   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5576   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5577   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5578   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5579   /* Stop PETSc from shrinking memory */
5580   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5581   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5582   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5583   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5584   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5585   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5586   PetscCall(PetscSFDestroy(&sf));
5587   PetscCall(PetscSFDestroy(&osf));
5588   PetscFunctionReturn(PETSC_SUCCESS);
5589 }
5590 
5591 /*
5592  * Creates a SeqAIJ matrix by taking the rows of B that correspond to the nonzero columns of the local part of A.
5593  * This supports MPIAIJ and MAIJ.
5594  * */
5595 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5596 {
5597   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5598   Mat_SeqAIJ *p_oth;
5599   IS          rows, map;
5600   PetscHMapI  hamp;
5601   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5602   MPI_Comm    comm;
5603   PetscSF     sf, osf;
5604   PetscBool   has;
5605 
5606   PetscFunctionBegin;
5607   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5608   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5609   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5610    *  and then create a submatrix (that often is an overlapping matrix)
5611    * */
5612   if (reuse == MAT_INITIAL_MATRIX) {
5613     /* Use a hash table to figure out unique keys */
5614     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5615     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5616     count = 0;
5617     /* Assume that a->garray is sorted, otherwise the following does not make sense */
5618     for (i = 0; i < a->B->cmap->n; i++) {
5619       key = a->garray[i] / dof;
5620       PetscCall(PetscHMapIHas(hamp, key, &has));
5621       if (!has) {
5622         mapping[i] = count;
5623         PetscCall(PetscHMapISet(hamp, key, count++));
5624       } else {
5625         /* Current 'i' has the same value as the previous step */
5626         mapping[i] = count - 1;
5627       }
5628     }
5629     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5630     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5631     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5632     PetscCall(PetscCalloc1(htsize, &rowindices));
5633     off = 0;
5634     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5635     PetscCall(PetscHMapIDestroy(&hamp));
5636     PetscCall(PetscSortInt(htsize, rowindices));
5637     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
5638     /* In case the matrix was already created but the user wants to recreate it */
5639     PetscCall(MatDestroy(P_oth));
5640     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5641     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5642     PetscCall(ISDestroy(&map));
5643     PetscCall(ISDestroy(&rows));
5644   } else if (reuse == MAT_REUSE_MATRIX) {
5645     /* If matrix was already created, we simply update values using SF objects
5646      * that were attached to the matrix earlier.
5647      */
5648     const PetscScalar *pd_a, *po_a;
5649 
5650     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5651     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5652     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5653     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5654     /* Update values in place */
5655     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5656     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5657     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5658     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5659     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5660     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5661     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5662     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5663   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5664   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5665   PetscFunctionReturn(PETSC_SUCCESS);
5666 }
5667 
5668 /*@C
5669   MatGetBrowsOfAcols - Returns an `IS` that contains the rows of `B` corresponding to the nonzero columns of the local part of `A`
5670 
5671   Collective
5672 
5673   Input Parameters:
5674 + A     - the first matrix in `MATMPIAIJ` format
5675 . B     - the second matrix in `MATMPIAIJ` format
5676 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5677 
5678   Output Parameters:
5679 + rowb  - On input, the index set of rows of `B` to extract (or `NULL`); modified on output
5680 . colb  - On input, the index set of columns of `B` to extract (or `NULL`); modified on output
5681 - B_seq - the sequential matrix generated
5682 
5683   Level: developer
5684 
5685 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5686 @*/
5687 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5688 {
5689   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5690   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5691   IS          isrowb, iscolb;
5692   Mat        *bseq = NULL;
5693 
5694   PetscFunctionBegin;
5695   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5696              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5697   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5698 
5699   if (scall == MAT_INITIAL_MATRIX) {
5700     start = A->cmap->rstart;
5701     cmap  = a->garray;
5702     nzA   = a->A->cmap->n;
5703     nzB   = a->B->cmap->n;
5704     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5705     ncols = 0;
5706     for (i = 0; i < nzB; i++) { /* row < local row index */
5707       if (cmap[i] < start) idx[ncols++] = cmap[i];
5708       else break;
5709     }
5710     imark = i;
5711     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5712     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5713     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5714     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5715   } else {
5716     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5717     isrowb = *rowb;
5718     iscolb = *colb;
5719     PetscCall(PetscMalloc1(1, &bseq));
5720     bseq[0] = *B_seq;
5721   }
5722   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5723   *B_seq = bseq[0];
5724   PetscCall(PetscFree(bseq));
5725   if (!rowb) {
5726     PetscCall(ISDestroy(&isrowb));
5727   } else {
5728     *rowb = isrowb;
5729   }
5730   if (!colb) {
5731     PetscCall(ISDestroy(&iscolb));
5732   } else {
5733     *colb = iscolb;
5734   }
5735   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5736   PetscFunctionReturn(PETSC_SUCCESS);
5737 }
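/*
  A sketch of the initial/reuse call pattern for MatGetBrowsOfAcols() (illustrative): the index
  sets produced by the first call are passed back in so the second call can reuse them.

    IS  rowb = NULL, colb = NULL;
    Mat B_seq = NULL;

    PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
    // ... the values of B change, nonzero pattern stays the same ...
    PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
    PetscCall(ISDestroy(&rowb));
    PetscCall(ISDestroy(&colb));
    PetscCall(MatDestroy(&B_seq));
*/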
5738 
5739 /*
5740     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
5741     of the OFF-DIAGONAL portion of local A
5742 
5743     Collective
5744 
5745    Input Parameters:
5746 +    A,B - the matrices in `MATMPIAIJ` format
5747 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5748 
5749    Output Parameters:
5750 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5751 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5752 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5753 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5754 
5755     Developer Note:
5756     This directly accesses information inside the VecScatter associated with the matrix-vector product
5757      for this matrix. This is not desirable.
5758 
5759     Level: developer
5760 
5761 */
5762 
5763 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5764 {
5765   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5766   VecScatter         ctx;
5767   MPI_Comm           comm;
5768   const PetscMPIInt *rprocs, *sprocs;
5769   PetscMPIInt        nrecvs, nsends;
5770   const PetscInt    *srow, *rstarts, *sstarts;
5771   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5772   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5773   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5774   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5775   PetscMPIInt        size, tag, rank, nreqs;
5776 
5777   PetscFunctionBegin;
5778   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5779   PetscCallMPI(MPI_Comm_size(comm, &size));
5780 
5781   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5782              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5783   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5784   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5785 
5786   if (size == 1) {
5787     startsj_s = NULL;
5788     bufa_ptr  = NULL;
5789     *B_oth    = NULL;
5790     PetscFunctionReturn(PETSC_SUCCESS);
5791   }
5792 
5793   ctx = a->Mvctx;
5794   tag = ((PetscObject)ctx)->tag;
5795 
5796   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5797   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5798   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5799   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5800   PetscCall(PetscMalloc1(nreqs, &reqs));
5801   rwaits = reqs;
5802   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5803 
5804   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5805   if (scall == MAT_INITIAL_MATRIX) {
5806     /* i-array */
5807     /*  post receives */
5808     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5809     for (i = 0; i < nrecvs; i++) {
5810       rowlen = rvalues + rstarts[i] * rbs;
5811       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5812       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5813     }
5814 
5815     /* pack the outgoing message */
5816     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5817 
5818     sstartsj[0] = 0;
5819     rstartsj[0] = 0;
5820     len         = 0; /* total length of j or a array to be sent */
5821     if (nsends) {
5822       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5823       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5824     }
5825     for (i = 0; i < nsends; i++) {
5826       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5827       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5828       for (j = 0; j < nrows; j++) {
5829         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5830         for (l = 0; l < sbs; l++) {
5831           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5832 
5833           rowlen[j * sbs + l] = ncols;
5834 
5835           len += ncols;
5836           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5837         }
5838         k++;
5839       }
5840       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5841 
5842       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5843     }
5844     /* recvs and sends of i-array are completed */
5845     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5846     PetscCall(PetscFree(svalues));
5847 
5848     /* allocate buffers for sending j and a arrays */
5849     PetscCall(PetscMalloc1(len, &bufj));
5850     PetscCall(PetscMalloc1(len, &bufa));
5851 
5852     /* create i-array of B_oth */
5853     PetscCall(PetscMalloc1(aBn + 1, &b_othi));
5854 
5855     b_othi[0] = 0;
5856     len       = 0; /* total length of j or a array to be received */
5857     k         = 0;
5858     for (i = 0; i < nrecvs; i++) {
5859       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5860       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5861       for (j = 0; j < nrows; j++) {
5862         b_othi[k + 1] = b_othi[k] + rowlen[j];
5863         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5864         k++;
5865       }
5866       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5867     }
5868     PetscCall(PetscFree(rvalues));
5869 
5870     /* allocate space for j and a arrays of B_oth */
5871     PetscCall(PetscMalloc1(b_othi[aBn], &b_othj));
5872     PetscCall(PetscMalloc1(b_othi[aBn], &b_otha));
5873 
5874     /* j-array */
5875     /*  post receives of j-array */
5876     for (i = 0; i < nrecvs; i++) {
5877       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5878       PetscCallMPI(MPIU_Irecv(PetscSafePointerPlusOffset(b_othj, rstartsj[i]), nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5879     }
5880 
5881     /* pack the outgoing message j-array */
5882     if (nsends) k = sstarts[0];
5883     for (i = 0; i < nsends; i++) {
5884       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5885       bufJ  = PetscSafePointerPlusOffset(bufj, sstartsj[i]);
5886       for (j = 0; j < nrows; j++) {
5887         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5888         for (ll = 0; ll < sbs; ll++) {
5889           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5890           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5891           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5892         }
5893       }
5894       PetscCallMPI(MPIU_Isend(PetscSafePointerPlusOffset(bufj, sstartsj[i]), sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5895     }
5896 
5897     /* recvs and sends of j-array are completed */
5898     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5899   } else if (scall == MAT_REUSE_MATRIX) {
5900     sstartsj = *startsj_s;
5901     rstartsj = *startsj_r;
5902     bufa     = *bufa_ptr;
5903     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5904   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5905 
5906   /* a-array */
5907   /*  post receives of a-array */
5908   for (i = 0; i < nrecvs; i++) {
5909     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5910     PetscCallMPI(MPIU_Irecv(PetscSafePointerPlusOffset(b_otha, rstartsj[i]), nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5911   }
5912 
5913   /* pack the outgoing message a-array */
5914   if (nsends) k = sstarts[0];
5915   for (i = 0; i < nsends; i++) {
5916     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5917     bufA  = PetscSafePointerPlusOffset(bufa, sstartsj[i]);
5918     for (j = 0; j < nrows; j++) {
5919       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5920       for (ll = 0; ll < sbs; ll++) {
5921         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5922         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5923         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5924       }
5925     }
5926     PetscCallMPI(MPIU_Isend(PetscSafePointerPlusOffset(bufa, sstartsj[i]), sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5927   }
5928   /* recvs and sends of a-array are completed */
5929   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5930   PetscCall(PetscFree(reqs));
5931 
5932   if (scall == MAT_INITIAL_MATRIX) {
5933     Mat_SeqAIJ *b_oth;
5934 
5935     /* put together the new matrix */
5936     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
5937 
5938     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5939     /* Since these are PETSc arrays, change flags to free them as necessary. */
5940     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
5941     b_oth->free_a  = PETSC_TRUE;
5942     b_oth->free_ij = PETSC_TRUE;
5943     b_oth->nonew   = 0;
5944 
5945     PetscCall(PetscFree(bufj));
5946     if (!startsj_s || !bufa_ptr) {
5947       PetscCall(PetscFree2(sstartsj, rstartsj));
5948       PetscCall(PetscFree(bufa_ptr));
5949     } else {
5950       *startsj_s = sstartsj;
5951       *startsj_r = rstartsj;
5952       *bufa_ptr  = bufa;
5953     }
5954   } else if (scall == MAT_REUSE_MATRIX) {
5955     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
5956   }
5957 
5958   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
5959   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
5960   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
5961   PetscFunctionReturn(PETSC_SUCCESS);
5962 }
5963 
5964 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
5965 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
5966 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
5967 #if defined(PETSC_HAVE_MKL_SPARSE)
5968 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
5969 #endif
5970 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
5971 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
5972 #if defined(PETSC_HAVE_ELEMENTAL)
5973 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
5974 #endif
5975 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
5976 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
5977 #endif
5978 #if defined(PETSC_HAVE_HYPRE)
5979 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
5980 #endif
5981 #if defined(PETSC_HAVE_CUDA)
5982 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
5983 #endif
5984 #if defined(PETSC_HAVE_HIP)
5985 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
5986 #endif
5987 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
5988 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
5989 #endif
5990 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
5991 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
5992 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
5993 
5994 /*
5995     Computes C = A*B as (B'*A')' since computing A*B directly is untenable
5996 
5997                n                       p                          p
5998         [             ]       [             ]         [                 ]
5999       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6000         [             ]       [             ]         [                 ]
6001 
6002 */
6003 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6004 {
6005   Mat At, Bt, Ct;
6006 
6007   PetscFunctionBegin;
6008   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6009   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6010   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6011   PetscCall(MatDestroy(&At));
6012   PetscCall(MatDestroy(&Bt));
6013   PetscCall(MatTransposeSetPrecursor(Ct, C));
6014   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6015   PetscCall(MatDestroy(&Ct));
6016   PetscFunctionReturn(PETSC_SUCCESS);
6017 }
6018 
6019 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6020 {
6021   PetscBool cisdense;
6022 
6023   PetscFunctionBegin;
6024   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6025   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6026   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6027   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6028   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6029   PetscCall(MatSetUp(C));
6030 
6031   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6032   PetscFunctionReturn(PETSC_SUCCESS);
6033 }
6034 
6035 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6036 {
6037   Mat_Product *product = C->product;
6038   Mat          A = product->A, B = product->B;
6039 
6040   PetscFunctionBegin;
6041   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6042              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6043   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6044   C->ops->productsymbolic = MatProductSymbolic_AB;
6045   PetscFunctionReturn(PETSC_SUCCESS);
6046 }
6047 
6048 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6049 {
6050   Mat_Product *product = C->product;
6051 
6052   PetscFunctionBegin;
6053   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6054   PetscFunctionReturn(PETSC_SUCCESS);
6055 }
6056 
6057 /*
6058    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6059 
6060   Input Parameters:
6061 
6062     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6063     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6064 
6065     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6066 
6067     For Set1, j1[] contains column indices of the nonzeros.
6068     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
6069     respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6070     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6071 
6072     Similar for Set2.
6073 
6074     This routine merges the two sets of nonzeros row by row and removes repeats.
6075 
6076   Output Parameters: (memory is allocated by the caller)
6077 
6078     i[],j[]: the CSR of the merged matrix, which has m rows.
6079     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6080     imap2[]: similar to imap1[], but for Set2.
6081     Note we order nonzeros row-by-row and from left to right.
6082 */
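/*
  A small worked example of the merge above (numbers are illustrative). For a single row, suppose

    Set1: j1 = {1,1,4}, jmap1 = {0,2,3}  (unique columns 1 and 4; column 1 appears twice)
    Set2: j2 = {2,4,4}, jmap2 = {0,1,3}  (unique columns 2 and 4; column 4 appears twice)

  The merged row is j = {1,2,4} with i = {0,3}, and

    imap1 = {0,2}  (Set1's unique nonzeros, columns 1 and 4, are merged nonzeros 0 and 2)
    imap2 = {1,2}  (Set2's unique nonzeros, columns 2 and 4, are merged nonzeros 1 and 2)
*/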
6083 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6084 {
6085   PetscInt   r, m; /* Row index of mat */
6086   PetscCount t, t1, t2, b1, e1, b2, e2;
6087 
6088   PetscFunctionBegin;
6089   PetscCall(MatGetLocalSize(mat, &m, NULL));
6090   t1 = t2 = t = 0; /* Count unique nonzeros in Set1, Set2 and the merged set, respectively */
6091   i[0]        = 0;
6092   for (r = 0; r < m; r++) { /* Do row by row merging */
6093     b1 = rowBegin1[r];
6094     e1 = rowEnd1[r];
6095     b2 = rowBegin2[r];
6096     e2 = rowEnd2[r];
6097     while (b1 < e1 && b2 < e2) {
6098       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6099         j[t]      = j1[b1];
6100         imap1[t1] = t;
6101         imap2[t2] = t;
6102         b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to next unique nonzero of Set1 */
6103         b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to next unique nonzero of Set2 */
6104         t1++;
6105         t2++;
6106         t++;
6107       } else if (j1[b1] < j2[b2]) {
6108         j[t]      = j1[b1];
6109         imap1[t1] = t;
6110         b1 += jmap1[t1 + 1] - jmap1[t1];
6111         t1++;
6112         t++;
6113       } else {
6114         j[t]      = j2[b2];
6115         imap2[t2] = t;
6116         b2 += jmap2[t2 + 1] - jmap2[t2];
6117         t2++;
6118         t++;
6119       }
6120     }
6121     /* Merge the remaining in either j1[] or j2[] */
6122     while (b1 < e1) {
6123       j[t]      = j1[b1];
6124       imap1[t1] = t;
6125       b1 += jmap1[t1 + 1] - jmap1[t1];
6126       t1++;
6127       t++;
6128     }
6129     while (b2 < e2) {
6130       j[t]      = j2[b2];
6131       imap2[t2] = t;
6132       b2 += jmap2[t2 + 1] - jmap2[t2];
6133       t2++;
6134       t++;
6135     }
6136     PetscCall(PetscIntCast(t, i + r + 1));
6137   }
6138   PetscFunctionReturn(PETSC_SUCCESS);
6139 }
6140 
6141 /*
6142   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6143 
6144   Input Parameters:
6145     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6146     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6147       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6148 
6149       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6150       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6151 
6152   Output Parameters:
6153     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6154     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6155       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6156       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6157 
6158     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6159       Atot: number of entries belonging to the diagonal block.
6160       Annz: number of unique nonzeros belonging to the diagonal block.
6161       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, which counts
6162         repeats (i.e., the same 'i,j' pair may appear multiple times).
6163       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6164         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6168 
6169     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6170 
6171     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6172 */
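/*
  A small worked example of the splitting above (numbers are illustrative). Take one local row r
  with the entries

    j = {7, 2, 2, 9} (unsorted, with a repeat), perm = {0, 1, 2, 3},

  and a diagonal column range [cstart,cend) = [0,5). After sorting, the diagonal part {2,2} comes
  before the off-diagonal part {7,9}, so rowBegin[r] = 0, rowMid[r] = 2, rowEnd[r] = 4, and

    Atot = 2, Annz = 1, Ajmap = {0,2},   Aperm = {1,2}
    Btot = 2, Bnnz = 2, Bjmap = {0,1,2}, Bperm = {0,3}
*/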
6173 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6174 {
6175   PetscInt    cstart, cend, rstart, rend, row, col;
6176   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6177   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6178   PetscCount  k, m, p, q, r, s, mid;
6179   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6180 
6181   PetscFunctionBegin;
6182   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6183   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6184   m = rend - rstart;
6185 
6186   /* Skip negative rows */
6187   for (k = 0; k < n; k++)
6188     if (i[k] >= 0) break;
6189 
6190   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6191      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6192   */
6193   while (k < n) {
6194     row = i[k];
6195     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6196     for (s = k; s < n; s++)
6197       if (i[s] != row) break;
6198 
6199     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6200     for (p = k; p < s; p++) {
6201       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6202     }
6203     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6204     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6205     rowBegin[row - rstart] = k;
6206     rowMid[row - rstart]   = mid;
6207     rowEnd[row - rstart]   = s;
6208     PetscCheck(k == s || j[s - 1] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is >= matrix column size %" PetscInt_FMT, j[s - 1], mat->cmap->N);
6209 
6210     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6211     Atot += mid - k;
6212     Btot += s - mid;
6213 
6214     /* Count unique nonzeros of this diag row */
6215     for (p = k; p < mid;) {
6216       col = j[p];
6217       do {
6218         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6219         p++;
6220       } while (p < mid && j[p] == col);
6221       Annz++;
6222     }
6223 
6224     /* Count unique nonzeros of this offdiag row */
6225     for (p = mid; p < s;) {
6226       col = j[p];
6227       do {
6228         p++;
6229       } while (p < s && j[p] == col);
6230       Bnnz++;
6231     }
6232     k = s;
6233   }
6234 
6235   /* Allocation according to Atot, Btot, Annz, Bnnz */
6236   PetscCall(PetscMalloc1(Atot, &Aperm));
6237   PetscCall(PetscMalloc1(Btot, &Bperm));
6238   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6239   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6240 
6241   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6242   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6243   for (r = 0; r < m; r++) {
6244     k   = rowBegin[r];
6245     mid = rowMid[r];
6246     s   = rowEnd[r];
6247     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6248     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6249     Atot += mid - k;
6250     Btot += s - mid;
6251 
6252     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6253     for (p = k; p < mid;) {
6254       col = j[p];
6255       q   = p;
6256       do {
6257         p++;
6258       } while (p < mid && j[p] == col);
6259       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6260       Annz++;
6261     }
6262 
6263     for (p = mid; p < s;) {
6264       col = j[p];
6265       q   = p;
6266       do {
6267         p++;
6268       } while (p < s && j[p] == col);
6269       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6270       Bnnz++;
6271     }
6272   }
6273   /* Output */
6274   *Aperm_ = Aperm;
6275   *Annz_  = Annz;
6276   *Atot_  = Atot;
6277   *Ajmap_ = Ajmap;
6278   *Bperm_ = Bperm;
6279   *Bnnz_  = Bnnz;
6280   *Btot_  = Btot;
6281   *Bjmap_ = Bjmap;
6282   PetscFunctionReturn(PETSC_SUCCESS);
6283 }
6284 
6285 /*
6286   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6287 
6288   Input Parameters:
6289     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6290     nnz:  number of unique nonzeros in the merged matrix
6291     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6292     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6293 
6294   Output Parameter: (memory is allocated by the caller)
6295     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6296 
6297   Example:
6298     nnz1 = 4
6299     nnz  = 6
6300     imap = [1,3,4,5]
6301     jmap = [0,3,5,6,7]
6302    then,
6303     jmap_new = [0,0,3,3,5,6,7]
6304 */
6305 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6306 {
6307   PetscCount k, p;
6308 
6309   PetscFunctionBegin;
6310   jmap_new[0] = 0;
6311   p           = nnz;                /* p loops over jmap_new[] backwards */
6312   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6313     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6314   }
6315   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6316   PetscFunctionReturn(PETSC_SUCCESS);
6317 }
6318 
6319 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6320 {
6321   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6322 
6323   PetscFunctionBegin;
6324   PetscCall(PetscSFDestroy(&coo->sf));
6325   PetscCall(PetscFree(coo->Aperm1));
6326   PetscCall(PetscFree(coo->Bperm1));
6327   PetscCall(PetscFree(coo->Ajmap1));
6328   PetscCall(PetscFree(coo->Bjmap1));
6329   PetscCall(PetscFree(coo->Aimap2));
6330   PetscCall(PetscFree(coo->Bimap2));
6331   PetscCall(PetscFree(coo->Aperm2));
6332   PetscCall(PetscFree(coo->Bperm2));
6333   PetscCall(PetscFree(coo->Ajmap2));
6334   PetscCall(PetscFree(coo->Bjmap2));
6335   PetscCall(PetscFree(coo->Cperm1));
6336   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6337   PetscCall(PetscFree(coo));
6338   PetscFunctionReturn(PETSC_SUCCESS);
6339 }
6340 
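/*
  MatSetPreallocationCOO_MPIAIJ - set up an MPIAIJ matrix for MatSetValuesCOO() from COO index lists.
  A summary of the steps performed below (a descriptive overview of this routine, not a public API contract):
    1. Sort the local COO list by row so that to-be-ignored entries (negative indices) come first, then locally owned rows, then remote rows.
    2. Send remote entries to their owning ranks with two star forests: sf1 computes receive offsets, sf2 moves the data and is kept for MatSetValuesCOO_MPIAIJ().
    3. Split the local list and the received list into diag/offdiag portions with MatSplitEntries_Internal().
    4. Merge the two lists into the CSR patterns of the diag (A) and offdiag (B) blocks and build the perm/jmap/imap arrays that map COO positions to stored nonzeros.
    5. Stash everything MatSetValuesCOO_MPIAIJ() needs in a MatCOOStruct_MPIAIJ container composed with the matrix as "__PETSc_MatCOOStruct_Host".
*/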
6341 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6342 {
6343   MPI_Comm             comm;
6344   PetscMPIInt          rank, size;
6345   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6346   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6347   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6348   PetscContainer       container;
6349   MatCOOStruct_MPIAIJ *coo;
6350 
6351   PetscFunctionBegin;
6352   PetscCall(PetscFree(mpiaij->garray));
6353   PetscCall(VecDestroy(&mpiaij->lvec));
6354 #if defined(PETSC_USE_CTABLE)
6355   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6356 #else
6357   PetscCall(PetscFree(mpiaij->colmap));
6358 #endif
6359   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6360   mat->assembled     = PETSC_FALSE;
6361   mat->was_assembled = PETSC_FALSE;
6362 
6363   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6364   PetscCallMPI(MPI_Comm_size(comm, &size));
6365   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6366   PetscCall(PetscLayoutSetUp(mat->rmap));
6367   PetscCall(PetscLayoutSetUp(mat->cmap));
6368   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6369   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6370   PetscCall(MatGetLocalSize(mat, &m, &n));
6371   PetscCall(MatGetSize(mat, &M, &N));
6372 
6373   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6374   /* entries come first, then local rows, then remote rows.                     */
6375   PetscCount n1 = coo_n, *perm1;
6376   PetscInt  *i1 = coo_i, *j1 = coo_j;
6377 
6378   PetscCall(PetscMalloc1(n1, &perm1));
6379   for (k = 0; k < n1; k++) perm1[k] = k;
6380 
6381   /* Manipulate indices so that entries with negative row or col indices will have smallest
6382      row indices, local entries will have greater but negative row indices, and remote entries
6383      will have positive row indices.
6384   */
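  /* For example, with rstart=100 and rend=200 (illustrative values): an entry with i=-1 becomes PETSC_INT_MIN,
     a local row i=150 becomes 150-PETSC_INT_MAX (negative, but greater than PETSC_INT_MIN), and a remote row
     i=250 stays at 250, so sorting by row orders the entries as ignored, local, remote.
  */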
6385   for (k = 0; k < n1; k++) {
6386     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6387     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6388     else {
6389       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but insert to remote rows");
6390       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6391     }
6392   }
6393 
6394   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6395   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6396 
6397   /* Advance k to the first entry we need to take care of */
6398   for (k = 0; k < n1; k++)
6399     if (i1[k] > PETSC_INT_MIN) break;
6400   PetscCount i1start = k;
6401 
6402   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6403   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows */
6404 
6405   PetscCheck(n1 == 0 || i1[n1 - 1] < M, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "COO row index %" PetscInt_FMT " is >= the matrix row size %" PetscInt_FMT, i1[n1 - 1], M);
6406 
6407   /*           Send remote rows to their owner                                  */
6408   /* Find which rows should be sent to which remote ranks*/
6409   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6410   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6411   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6412   const PetscInt *ranges;
6413   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6414 
6415   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6416   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6417   for (k = rem; k < n1;) {
6418     PetscMPIInt owner;
6419     PetscInt    firstRow, lastRow;
6420 
6421     /* Locate a row range */
6422     firstRow = i1[k]; /* first row of this owner */
6423     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6424     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6425 
6426     /* Find the first index 'p' in [k,n) with i1[p] belonging to next owner */
6427     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6428 
6429     /* All entries in [k,p) belong to this remote owner */
6430     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6431       PetscMPIInt *sendto2;
6432       PetscInt    *nentries2;
6433       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6434 
6435       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6436       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6437       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6438       PetscCall(PetscFree2(sendto, nentries));
6439       sendto   = sendto2;
6440       nentries = nentries2;
6441       maxNsend = maxNsend2;
6442     }
6443     sendto[nsend] = owner;
6444     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6445     nsend++;
6446     k = p;
6447   }
6448 
6449   /* Build 1st SF to know offsets on remote to send data */
6450   PetscSF      sf1;
6451   PetscInt     nroots = 1, nroots2 = 0;
6452   PetscInt     nleaves = nsend, nleaves2 = 0;
6453   PetscInt    *offsets;
6454   PetscSFNode *iremote;
6455 
6456   PetscCall(PetscSFCreate(comm, &sf1));
6457   PetscCall(PetscMalloc1(nsend, &iremote));
6458   PetscCall(PetscMalloc1(nsend, &offsets));
6459   for (k = 0; k < nsend; k++) {
6460     iremote[k].rank  = sendto[k];
6461     iremote[k].index = 0;
6462     nleaves2 += nentries[k];
6463     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6464   }
6465   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6466   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6467   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Should nroots2 overflow, it is caught by the offsets[] check below */
6468   PetscCall(PetscSFDestroy(&sf1));
6469   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6470 
6471   /* Build 2nd SF to send remote COOs to their owner */
6472   PetscSF sf2;
6473   nroots  = nroots2;
6474   nleaves = nleaves2;
6475   PetscCall(PetscSFCreate(comm, &sf2));
6476   PetscCall(PetscSFSetFromOptions(sf2));
6477   PetscCall(PetscMalloc1(nleaves, &iremote));
6478   p = 0;
6479   for (k = 0; k < nsend; k++) {
6480     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6481     for (q = 0; q < nentries[k]; q++, p++) {
6482       iremote[p].rank = sendto[k];
6483       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6484     }
6485   }
6486   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6487 
6488   /* Send the remote COOs to their owner */
6489   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6490   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6491   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6492   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6493   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6494   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6495   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6496   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6497   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6498   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6499   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6500 
6501   PetscCall(PetscFree(offsets));
6502   PetscCall(PetscFree2(sendto, nentries));
6503 
6504   /* Sort received COOs by row along with the permutation array     */
6505   for (k = 0; k < n2; k++) perm2[k] = k;
6506   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6507 
6508   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6509   PetscCount *Cperm1;
6510   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6511   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6512   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6513   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6514 
6515   /* Support for HYPRE matrices, kind of a hack.
6516      Swap min column with diagonal so that diagonal values will go first */
6517   PetscBool hypre;
6518   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6519   if (hypre) {
6520     PetscInt *minj;
6521     PetscBT   hasdiag;
6522 
6523     PetscCall(PetscBTCreate(m, &hasdiag));
6524     PetscCall(PetscMalloc1(m, &minj));
6525     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6526     for (k = i1start; k < rem; k++) {
6527       if (j1[k] < cstart || j1[k] >= cend) continue;
6528       const PetscInt rindex = i1[k] - rstart;
6529       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6530       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6531     }
6532     for (k = 0; k < n2; k++) {
6533       if (j2[k] < cstart || j2[k] >= cend) continue;
6534       const PetscInt rindex = i2[k] - rstart;
6535       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6536       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6537     }
6538     for (k = i1start; k < rem; k++) {
6539       const PetscInt rindex = i1[k] - rstart;
6540       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6541       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6542       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6543     }
6544     for (k = 0; k < n2; k++) {
6545       const PetscInt rindex = i2[k] - rstart;
6546       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6547       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6548       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6549     }
6550     PetscCall(PetscBTDestroy(&hasdiag));
6551     PetscCall(PetscFree(minj));
6552   }
6553 
6554   /* Split local COOs and received COOs into diag/offdiag portions */
6555   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6556   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6557   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6558   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6559   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6560   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6561 
6562   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6563   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6564   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6565   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6566 
6567   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6568   PetscInt *Ai, *Bi;
6569   PetscInt *Aj, *Bj;
6570 
6571   PetscCall(PetscMalloc1(m + 1, &Ai));
6572   PetscCall(PetscMalloc1(m + 1, &Bi));
6573   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6574   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6575 
6576   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6577   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6578   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6579   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6580   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6581 
6582   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6583   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6584 
6585   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6586   /* expect nonzeros in A/B most likely have local contributing entries        */
6587   PetscInt    Annz = Ai[m];
6588   PetscInt    Bnnz = Bi[m];
6589   PetscCount *Ajmap1_new, *Bjmap1_new;
6590 
6591   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6592   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6593 
6594   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6595   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6596 
6597   PetscCall(PetscFree(Aimap1));
6598   PetscCall(PetscFree(Ajmap1));
6599   PetscCall(PetscFree(Bimap1));
6600   PetscCall(PetscFree(Bjmap1));
6601   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6602   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6603   PetscCall(PetscFree(perm1));
6604   PetscCall(PetscFree3(i2, j2, perm2));
6605 
6606   Ajmap1 = Ajmap1_new;
6607   Bjmap1 = Bjmap1_new;
6608 
6609   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6610   if (Annz < Annz1 + Annz2) {
6611     PetscInt *Aj_new;
6612     PetscCall(PetscMalloc1(Annz, &Aj_new));
6613     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6614     PetscCall(PetscFree(Aj));
6615     Aj = Aj_new;
6616   }
6617 
6618   if (Bnnz < Bnnz1 + Bnnz2) {
6619     PetscInt *Bj_new;
6620     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6621     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6622     PetscCall(PetscFree(Bj));
6623     Bj = Bj_new;
6624   }
6625 
6626   /* Create new submatrices for on-process and off-process coupling                  */
6627   PetscScalar     *Aa, *Ba;
6628   MatType          rtype;
6629   Mat_SeqAIJ      *a, *b;
6630   PetscObjectState state;
6631   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6632   PetscCall(PetscCalloc1(Bnnz, &Ba));
6633   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6634   if (cstart) {
6635     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6636   }
6637 
6638   PetscCall(MatGetRootType_Private(mat, &rtype));
6639 
6640   MatSeqXAIJGetOptions_Private(mpiaij->A);
6641   PetscCall(MatDestroy(&mpiaij->A));
6642   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6643   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6644   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6645 
6646   MatSeqXAIJGetOptions_Private(mpiaij->B);
6647   PetscCall(MatDestroy(&mpiaij->B));
6648   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6649   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6650   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6651 
6652   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6653   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6654   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6655   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6656 
6657   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6658   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6659   a->free_a  = PETSC_TRUE;
6660   a->free_ij = PETSC_TRUE;
6661   b->free_a  = PETSC_TRUE;
6662   b->free_ij = PETSC_TRUE;
6663   a->maxnz   = a->nz;
6664   b->maxnz   = b->nz;
6665 
6666   /* conversion must happen AFTER multiply setup */
6667   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6668   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6669   PetscCall(VecDestroy(&mpiaij->lvec));
6670   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6671 
6672   // Put the COO struct in a container and then attach that to the matrix
6673   PetscCall(PetscMalloc1(1, &coo));
6674   coo->n       = coo_n;
6675   coo->sf      = sf2;
6676   coo->sendlen = nleaves;
6677   coo->recvlen = nroots;
6678   coo->Annz    = Annz;
6679   coo->Bnnz    = Bnnz;
6680   coo->Annz2   = Annz2;
6681   coo->Bnnz2   = Bnnz2;
6682   coo->Atot1   = Atot1;
6683   coo->Atot2   = Atot2;
6684   coo->Btot1   = Btot1;
6685   coo->Btot2   = Btot2;
6686   coo->Ajmap1  = Ajmap1;
6687   coo->Aperm1  = Aperm1;
6688   coo->Bjmap1  = Bjmap1;
6689   coo->Bperm1  = Bperm1;
6690   coo->Aimap2  = Aimap2;
6691   coo->Ajmap2  = Ajmap2;
6692   coo->Aperm2  = Aperm2;
6693   coo->Bimap2  = Bimap2;
6694   coo->Bjmap2  = Bjmap2;
6695   coo->Bperm2  = Bperm2;
6696   coo->Cperm1  = Cperm1;
6697   // Allocate in preallocation. If not used, it has zero cost on host
6698   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6699   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6700   PetscCall(PetscContainerSetPointer(container, coo));
6701   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6702   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6703   PetscCall(PetscContainerDestroy(&container));
6704   PetscFunctionReturn(PETSC_SUCCESS);
6705 }
6706 
6707 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6708 {
6709   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6710   Mat                  A = mpiaij->A, B = mpiaij->B;
6711   PetscScalar         *Aa, *Ba;
6712   PetscScalar         *sendbuf, *recvbuf;
6713   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6714   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6715   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6716   const PetscCount    *Cperm1;
6717   PetscContainer       container;
6718   MatCOOStruct_MPIAIJ *coo;
6719 
6720   PetscFunctionBegin;
6721   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6722   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
6723   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6724   sendbuf = coo->sendbuf;
6725   recvbuf = coo->recvbuf;
6726   Ajmap1  = coo->Ajmap1;
6727   Ajmap2  = coo->Ajmap2;
6728   Aimap2  = coo->Aimap2;
6729   Bjmap1  = coo->Bjmap1;
6730   Bjmap2  = coo->Bjmap2;
6731   Bimap2  = coo->Bimap2;
6732   Aperm1  = coo->Aperm1;
6733   Aperm2  = coo->Aperm2;
6734   Bperm1  = coo->Bperm1;
6735   Bperm2  = coo->Bperm2;
6736   Cperm1  = coo->Cperm1;
6737 
6738   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6739   PetscCall(MatSeqAIJGetArray(B, &Ba));
6740 
6741   /* Pack entries to be sent to remote */
6742   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6743 
6744   /* Send remote entries to their owner and overlap the communication with local computation */
6745   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6746   /* Add local entries to A and B */
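  /* Ajmap1[i+1] - Ajmap1[i] is the number of local COO entries contributing to the i-th stored nonzero of A,
     and Aperm1[] maps those entries back to their positions in v[]; Bjmap1[]/Bperm1[] play the same role for B */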
6747   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6748     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6749     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6750     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6751   }
6752   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6753     PetscScalar sum = 0.0;
6754     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6755     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6756   }
6757   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6758 
6759   /* Add received remote entries to A and B */
6760   for (PetscCount i = 0; i < coo->Annz2; i++) {
6761     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6762   }
6763   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6764     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6765   }
6766   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6767   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6768   PetscFunctionReturn(PETSC_SUCCESS);
6769 }
6770 
6771 /*MC
6772    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6773 
6774    Options Database Keys:
6775 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6776 
6777    Level: beginner
6778 
6779    Notes:
6780    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6781     in this case the values associated with the rows and columns one passes in are set to zero
6782     in the matrix
6783 
6784     `MatSetOption`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
6785     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
6786 
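   Example Usage:
   A minimal creation sequence (an illustrative sketch; M and N are user-chosen global sizes, and the preallocation
   estimates 5 and 2 are placeholders to be tuned for the application):
.vb
   Mat A;

   PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N));
   PetscCall(MatSetType(A, MATMPIAIJ));
   PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); /* about 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
   /* ... MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ... */
.ve
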
6787 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6788 M*/
6789 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6790 {
6791   Mat_MPIAIJ *b;
6792   PetscMPIInt size;
6793 
6794   PetscFunctionBegin;
6795   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6796 
6797   PetscCall(PetscNew(&b));
6798   B->data       = (void *)b;
6799   B->ops[0]     = MatOps_Values;
6800   B->assembled  = PETSC_FALSE;
6801   B->insertmode = NOT_SET_VALUES;
6802   b->size       = size;
6803 
6804   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6805 
6806   /* build cache for off array entries formed */
6807   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6808 
6809   b->donotstash  = PETSC_FALSE;
6810   b->colmap      = NULL;
6811   b->garray      = NULL;
6812   b->roworiented = PETSC_TRUE;
6813 
6814   /* stuff used for matrix vector multiply */
6815   b->lvec  = NULL;
6816   b->Mvctx = NULL;
6817 
6818   /* stuff for MatGetRow() */
6819   b->rowindices   = NULL;
6820   b->rowvalues    = NULL;
6821   b->getrowactive = PETSC_FALSE;
6822 
6823   /* flexible pointer used in CUSPARSE classes */
6824   b->spptr = NULL;
6825 
6826   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6827   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6828   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6829   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6830   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6831   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6832   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
6833   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6834   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6835   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6836   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6837 #if defined(PETSC_HAVE_CUDA)
6838   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6839 #endif
6840 #if defined(PETSC_HAVE_HIP)
6841   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6842 #endif
6843 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6844   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6845 #endif
6846 #if defined(PETSC_HAVE_MKL_SPARSE)
6847   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6848 #endif
6849   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6850   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6851   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6852   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6853 #if defined(PETSC_HAVE_ELEMENTAL)
6854   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6855 #endif
6856 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
6857   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6858 #endif
6859   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6860   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6861 #if defined(PETSC_HAVE_HYPRE)
6862   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6863   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6864 #endif
6865   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6866   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6867   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6868   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6869   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6870   PetscFunctionReturn(PETSC_SUCCESS);
6871 }
6872 
6873 /*@
6874   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6875   and "off-diagonal" part of the matrix in CSR format.
6876 
6877   Collective
6878 
6879   Input Parameters:
6880 + comm - MPI communicator
6881 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6882 . n    - This value should be the same as the local size used in creating the
6883          x vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
6884          calculated if `N` is given) For square matrices `n` is almost always `m`.
6885 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
6886 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
6887 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6888 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6889 . a    - matrix values
6890 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6891 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6892 - oa   - matrix values
6893 
6894   Output Parameter:
6895 . mat - the matrix
6896 
6897   Level: advanced
6898 
6899   Notes:
6900   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6901   must free the arrays once the matrix has been destroyed and not before.
6902 
6903   The `i` and `j` indices are 0 based
6904 
6905   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6906 
6907   This sets local rows and cannot be used to set off-processor values.
6908 
6909   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6910   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6911   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6912   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6913   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6914   communication if it is known that only local entries will be set.
6915 
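  Example Usage:
  A minimal two-rank sketch (illustrative values) for the 2 x 2 matrix [2 -1; -1 2], where each rank owns one row and
  one column; `rank` is assumed to hold the result of `MPI_Comm_rank()` on the communicator:
.vb
  Mat         A;
  PetscInt    i[]  = {0, 1}, j[]  = {0};            /* one diagonal-block entry per row, local column 0    */
  PetscScalar a[]  = {2.0};
  PetscInt    oi[] = {0, 1}, oj[] = {rank ? 0 : 1}; /* one off-diagonal entry per row, global column index */
  PetscScalar oa[] = {-1.0};

  PetscCall(MatCreateMPIAIJWithSplitArrays(PETSC_COMM_WORLD, 1, 1, 2, 2, i, j, a, oi, oj, oa, &A));
  /* the arrays above are not copied, so they must remain valid until A is destroyed */
.ve
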
6916 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6917           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6918 @*/
6919 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
6920 {
6921   Mat_MPIAIJ *maij;
6922 
6923   PetscFunctionBegin;
6924   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
6925   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
6926   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
6927   PetscCall(MatCreate(comm, mat));
6928   PetscCall(MatSetSizes(*mat, m, n, M, N));
6929   PetscCall(MatSetType(*mat, MATMPIAIJ));
6930   maij = (Mat_MPIAIJ *)(*mat)->data;
6931 
6932   (*mat)->preallocated = PETSC_TRUE;
6933 
6934   PetscCall(PetscLayoutSetUp((*mat)->rmap));
6935   PetscCall(PetscLayoutSetUp((*mat)->cmap));
6936 
6937   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
6938   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
6939 
6940   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
6941   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
6942   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
6943   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
6944   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
6945   PetscFunctionReturn(PETSC_SUCCESS);
6946 }
6947 
6948 typedef struct {
6949   Mat       *mp;    /* intermediate products */
6950   PetscBool *mptmp; /* is the intermediate product temporary ? */
6951   PetscInt   cp;    /* number of intermediate products */
6952 
6953   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
6954   PetscInt    *startsj_s, *startsj_r;
6955   PetscScalar *bufa;
6956   Mat          P_oth;
6957 
6958   /* may take advantage of merging product->B */
6959   Mat Bloc; /* B-local by merging diag and off-diag */
6960 
6961   /* cusparse does not have support to split between symbolic and numeric phases.
6962      When api_user is true, we don't need to update the numerical values
6963      of the temporary storage */
6964   PetscBool reusesym;
6965 
6966   /* support for COO values insertion */
6967   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
6968   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
6969   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
6970   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
6971   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
6972   PetscMemType mtype;
6973 
6974   /* customization */
6975   PetscBool abmerge;
6976   PetscBool P_oth_bind;
6977 } MatMatMPIAIJBACKEND;
6978 
6979 static PetscErrorCode MatProductCtxDestroy_MatMatMPIAIJBACKEND(void **data)
6980 {
6981   MatMatMPIAIJBACKEND *mmdata = *(MatMatMPIAIJBACKEND **)data;
6982   PetscInt             i;
6983 
6984   PetscFunctionBegin;
6985   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
6986   PetscCall(PetscFree(mmdata->bufa));
6987   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
6988   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
6989   PetscCall(MatDestroy(&mmdata->P_oth));
6990   PetscCall(MatDestroy(&mmdata->Bloc));
6991   PetscCall(PetscSFDestroy(&mmdata->sf));
6992   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
6993   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
6994   PetscCall(PetscFree(mmdata->own[0]));
6995   PetscCall(PetscFree(mmdata->own));
6996   PetscCall(PetscFree(mmdata->off[0]));
6997   PetscCall(PetscFree(mmdata->off));
6998   PetscCall(PetscFree(mmdata));
6999   PetscFunctionReturn(PETSC_SUCCESS);
7000 }
7001 
7002 /* Copy selected n entries with indices in idx[] of A to v[].
7003    If idx is NULL, copy the whole data array of A to v[]
7004  */
7005 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7006 {
7007   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7008 
7009   PetscFunctionBegin;
7010   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7011   if (f) {
7012     PetscCall((*f)(A, n, idx, v));
7013   } else {
7014     const PetscScalar *vv;
7015 
7016     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7017     if (n && idx) {
7018       PetscScalar    *w  = v;
7019       const PetscInt *oi = idx;
7020       PetscInt        j;
7021 
7022       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7023     } else {
7024       PetscCall(PetscArraycpy(v, vv, n));
7025     }
7026     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7027   }
7028   PetscFunctionReturn(PETSC_SUCCESS);
7029 }
7030 
7031 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7032 {
7033   MatMatMPIAIJBACKEND *mmdata;
7034   PetscInt             i, n_d, n_o;
7035 
7036   PetscFunctionBegin;
7037   MatCheckProduct(C, 1);
7038   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7039   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7040   if (!mmdata->reusesym) { /* update temporary matrices */
7041     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7042     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7043   }
7044   mmdata->reusesym = PETSC_FALSE;
7045 
7046   for (i = 0; i < mmdata->cp; i++) {
7047     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7048     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7049   }
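  /* Gather values of the non-temporary intermediate products into coo_v (entries to be inserted on this process)
     and coo_w (entries destined for other processes), following the own[]/off[] index lists built at symbolic time */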
7050   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7051     PetscInt noff;
7052 
7053     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7054     if (mmdata->mptmp[i]) continue;
7055     if (noff) {
7056       PetscInt nown;
7057 
7058       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7059       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7060       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7061       n_o += noff;
7062       n_d += nown;
7063     } else {
7064       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7065 
7066       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7067       n_d += mm->nz;
7068     }
7069   }
7070   if (mmdata->hasoffproc) { /* offprocess insertion */
7071     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7072     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7073   }
7074   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7075   PetscFunctionReturn(PETSC_SUCCESS);
7076 }
7077 
7078 /* Support for Pt * A, A * P, or Pt * A * P */
7079 #define MAX_NUMBER_INTERMEDIATE 4
7080 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7081 {
7082   Mat_Product           *product = C->product;
7083   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7084   Mat_MPIAIJ            *a, *p;
7085   MatMatMPIAIJBACKEND   *mmdata;
7086   ISLocalToGlobalMapping P_oth_l2g = NULL;
7087   IS                     glob      = NULL;
7088   const char            *prefix;
7089   char                   pprefix[256];
7090   const PetscInt        *globidx, *P_oth_idx;
7091   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7092   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7093   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7094                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7095                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7096   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
7097 
7098   MatProductType ptype;
7099   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7100   PetscMPIInt    size;
7101 
7102   PetscFunctionBegin;
7103   MatCheckProduct(C, 1);
7104   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7105   ptype = product->type;
7106   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7107     ptype                                          = MATPRODUCT_AB;
7108     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7109   }
7110   switch (ptype) {
7111   case MATPRODUCT_AB:
7112     A          = product->A;
7113     P          = product->B;
7114     m          = A->rmap->n;
7115     n          = P->cmap->n;
7116     M          = A->rmap->N;
7117     N          = P->cmap->N;
7118     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7119     break;
7120   case MATPRODUCT_AtB:
7121     P          = product->A;
7122     A          = product->B;
7123     m          = P->cmap->n;
7124     n          = A->cmap->n;
7125     M          = P->cmap->N;
7126     N          = A->cmap->N;
7127     hasoffproc = PETSC_TRUE;
7128     break;
7129   case MATPRODUCT_PtAP:
7130     A          = product->A;
7131     P          = product->B;
7132     m          = P->cmap->n;
7133     n          = P->cmap->n;
7134     M          = P->cmap->N;
7135     N          = P->cmap->N;
7136     hasoffproc = PETSC_TRUE;
7137     break;
7138   default:
7139     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7140   }
7141   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7142   if (size == 1) hasoffproc = PETSC_FALSE;
7143 
7144   /* defaults */
7145   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7146     mp[i]    = NULL;
7147     mptmp[i] = PETSC_FALSE;
7148     rmapt[i] = -1;
7149     cmapt[i] = -1;
7150     rmapa[i] = NULL;
7151     cmapa[i] = NULL;
7152   }
7153 
7154   /* customization */
7155   PetscCall(PetscNew(&mmdata));
7156   mmdata->reusesym = product->api_user;
7157   if (ptype == MATPRODUCT_AB) {
7158     if (product->api_user) {
7159       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7160       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7161       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7162       PetscOptionsEnd();
7163     } else {
7164       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7165       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7166       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7167       PetscOptionsEnd();
7168     }
7169   } else if (ptype == MATPRODUCT_PtAP) {
7170     if (product->api_user) {
7171       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7172       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7173       PetscOptionsEnd();
7174     } else {
7175       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7176       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7177       PetscOptionsEnd();
7178     }
7179   }
7180   a = (Mat_MPIAIJ *)A->data;
7181   p = (Mat_MPIAIJ *)P->data;
7182   PetscCall(MatSetSizes(C, m, n, M, N));
7183   PetscCall(PetscLayoutSetUp(C->rmap));
7184   PetscCall(PetscLayoutSetUp(C->cmap));
7185   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7186   PetscCall(MatGetOptionsPrefix(C, &prefix));
7187 
7188   cp = 0;
7189   switch (ptype) {
7190   case MATPRODUCT_AB: /* A * P */
7191     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7192 
7193     /* A_diag * P_local (merged or not) */
7194     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7195       /* P is product->B */
7196       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7197       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7198       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7199       PetscCall(MatProductSetFill(mp[cp], product->fill));
7200       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7201       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7202       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7203       mp[cp]->product->api_user = product->api_user;
7204       PetscCall(MatProductSetFromOptions(mp[cp]));
7205       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7206       PetscCall(ISGetIndices(glob, &globidx));
7207       rmapt[cp] = 1;
7208       cmapt[cp] = 2;
7209       cmapa[cp] = globidx;
7210       mptmp[cp] = PETSC_FALSE;
7211       cp++;
7212     } else { /* A_diag * P_diag and A_diag * P_off */
7213       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7214       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7215       PetscCall(MatProductSetFill(mp[cp], product->fill));
7216       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7217       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7218       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7219       mp[cp]->product->api_user = product->api_user;
7220       PetscCall(MatProductSetFromOptions(mp[cp]));
7221       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7222       rmapt[cp] = 1;
7223       cmapt[cp] = 1;
7224       mptmp[cp] = PETSC_FALSE;
7225       cp++;
7226       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7227       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7228       PetscCall(MatProductSetFill(mp[cp], product->fill));
7229       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7230       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7231       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7232       mp[cp]->product->api_user = product->api_user;
7233       PetscCall(MatProductSetFromOptions(mp[cp]));
7234       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7235       rmapt[cp] = 1;
7236       cmapt[cp] = 2;
7237       cmapa[cp] = p->garray;
7238       mptmp[cp] = PETSC_FALSE;
7239       cp++;
7240     }
7241 
7242     /* A_off * P_other */
7243     if (mmdata->P_oth) {
7244       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7245       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7246       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7247       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7248       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7249       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7250       PetscCall(MatProductSetFill(mp[cp], product->fill));
7251       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7252       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7253       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7254       mp[cp]->product->api_user = product->api_user;
7255       PetscCall(MatProductSetFromOptions(mp[cp]));
7256       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7257       rmapt[cp] = 1;
7258       cmapt[cp] = 2;
7259       cmapa[cp] = P_oth_idx;
7260       mptmp[cp] = PETSC_FALSE;
7261       cp++;
7262     }
7263     break;
7264 
7265   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7266     /* A is product->B */
7267     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7268     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7269       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7270       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7271       PetscCall(MatProductSetFill(mp[cp], product->fill));
7272       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7273       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7274       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7275       mp[cp]->product->api_user = product->api_user;
7276       PetscCall(MatProductSetFromOptions(mp[cp]));
7277       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7278       PetscCall(ISGetIndices(glob, &globidx));
7279       rmapt[cp] = 2;
7280       rmapa[cp] = globidx;
7281       cmapt[cp] = 2;
7282       cmapa[cp] = globidx;
7283       mptmp[cp] = PETSC_FALSE;
7284       cp++;
7285     } else {
7286       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7287       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7288       PetscCall(MatProductSetFill(mp[cp], product->fill));
7289       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7290       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7291       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7292       mp[cp]->product->api_user = product->api_user;
7293       PetscCall(MatProductSetFromOptions(mp[cp]));
7294       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7295       PetscCall(ISGetIndices(glob, &globidx));
7296       rmapt[cp] = 1;
7297       cmapt[cp] = 2;
7298       cmapa[cp] = globidx;
7299       mptmp[cp] = PETSC_FALSE;
7300       cp++;
7301       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7302       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7303       PetscCall(MatProductSetFill(mp[cp], product->fill));
7304       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7305       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7306       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7307       mp[cp]->product->api_user = product->api_user;
7308       PetscCall(MatProductSetFromOptions(mp[cp]));
7309       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7310       rmapt[cp] = 2;
7311       rmapa[cp] = p->garray;
7312       cmapt[cp] = 2;
7313       cmapa[cp] = globidx;
7314       mptmp[cp] = PETSC_FALSE;
7315       cp++;
7316     }
7317     break;
7318   case MATPRODUCT_PtAP:
7319     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7320     /* P is product->B */
7321     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7322     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7323     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7324     PetscCall(MatProductSetFill(mp[cp], product->fill));
7325     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7326     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7327     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7328     mp[cp]->product->api_user = product->api_user;
7329     PetscCall(MatProductSetFromOptions(mp[cp]));
7330     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7331     PetscCall(ISGetIndices(glob, &globidx));
7332     rmapt[cp] = 2;
7333     rmapa[cp] = globidx;
7334     cmapt[cp] = 2;
7335     cmapa[cp] = globidx;
7336     mptmp[cp] = PETSC_FALSE;
7337     cp++;
7338     if (mmdata->P_oth) {
7339       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7340       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7341       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7342       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7343       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7344       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7345       PetscCall(MatProductSetFill(mp[cp], product->fill));
7346       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7347       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7348       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7349       mp[cp]->product->api_user = product->api_user;
7350       PetscCall(MatProductSetFromOptions(mp[cp]));
7351       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7352       mptmp[cp] = PETSC_TRUE;
7353       cp++;
7354       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7355       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7356       PetscCall(MatProductSetFill(mp[cp], product->fill));
7357       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7358       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7359       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7360       mp[cp]->product->api_user = product->api_user;
7361       PetscCall(MatProductSetFromOptions(mp[cp]));
7362       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7363       rmapt[cp] = 2;
7364       rmapa[cp] = globidx;
7365       cmapt[cp] = 2;
7366       cmapa[cp] = P_oth_idx;
7367       mptmp[cp] = PETSC_FALSE;
7368       cp++;
7369     }
7370     break;
7371   default:
7372     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7373   }
7374   /* sanity check */
7375   if (size > 1)
7376     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7377 
7378   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7379   for (i = 0; i < cp; i++) {
7380     mmdata->mp[i]    = mp[i];
7381     mmdata->mptmp[i] = mptmp[i];
7382   }
7383   mmdata->cp             = cp;
7384   C->product->data       = mmdata;
7385   C->product->destroy    = MatProductCtxDestroy_MatMatMPIAIJBACKEND;
7386   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7387 
7388   /* memory type */
7389   mmdata->mtype = PETSC_MEMTYPE_HOST;
7390   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7391   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7392   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7393   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7394   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7395   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7396 
7397   /* prepare coo coordinates for values insertion */
7398 
7399   /* count the total nonzeros of the intermediate seqaij matrices
7400     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7401     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be sent to remote procs for insertion
7402     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7403   */
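       /* Illustrative example (hypothetical sizes, assuming hasoffproc is true): if there are two non-temporary
          intermediate matrices, mp[0] with rmapt[0] == 1 and 5 nonzeros, and mp[1] with rmapt[1] == 2 whose rows
          hold 3 offproc and 4 locally owned nonzeros, the loop below ends up with
          ncoo_d = 5, ncoo_o = 3 and ncoo_oown = 4 */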
7404   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7405     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7406     if (mptmp[cp]) continue;
7407     if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (might include self) */
7408       const PetscInt *rmap = rmapa[cp];
7409       const PetscInt  mr   = mp[cp]->rmap->n;
7410       const PetscInt  rs   = C->rmap->rstart;
7411       const PetscInt  re   = C->rmap->rend;
7412       const PetscInt *ii   = mm->i;
7413       for (i = 0; i < mr; i++) {
7414         const PetscInt gr = rmap[i];
7415         const PetscInt nz = ii[i + 1] - ii[i];
7416         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7417         else ncoo_oown += nz;                  /* this row is local */
7418       }
7419     } else ncoo_d += mm->nz;
7420   }
7421 
7422   /*
7423     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7424 
7425     ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted on this proc by other procs.
7426 
7427     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly, for own[0].
7428 
7429     off[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert on other procs
7430     own[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert locally
7431     so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to other procs.
7432 
7433     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7434     For example, coo_i[]: the first part (of size ncoo_d + ncoo_oown) stores the row indices of locally inserted nonzeros, and the remaining part stores the row indices of nonzeros this proc will receive.
7435   */
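       /* A small sketch of the layout above (hypothetical sizes): with cp = 2, if mp[0] contributes 2 offproc
          and 3 locally owned nonzeros and mp[1] contributes 1 and 4, then off[0] points to an index array of
          length ncoo_o = 3 with off[1] = off[0] + 2 and off[2] = off[0] + 3, own[0] points to an index array
          of length ncoo_oown = 7 with own[1] = own[0] + 3 and own[2] = own[0] + 7, and coo_i/j/v[] have
          length ncoo = ncoo_d + ncoo_oown + ncoo2, with the entries received from other procs stored at the
          end */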
7436   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7437   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7438 
7439   /* gather (i,j) of nonzeros inserted by remote procs */
7440   if (hasoffproc) {
7441     PetscSF  msf;
7442     PetscInt ncoo2, *coo_i2, *coo_j2;
7443 
7444     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7445     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7446     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7447 
7448     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7449       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7450       PetscInt   *idxoff = mmdata->off[cp];
7451       PetscInt   *idxown = mmdata->own[cp];
7452       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7453         const PetscInt *rmap = rmapa[cp];
7454         const PetscInt *cmap = cmapa[cp];
7455         const PetscInt *ii   = mm->i;
7456         PetscInt       *coi  = coo_i + ncoo_o;
7457         PetscInt       *coj  = coo_j + ncoo_o;
7458         const PetscInt  mr   = mp[cp]->rmap->n;
7459         const PetscInt  rs   = C->rmap->rstart;
7460         const PetscInt  re   = C->rmap->rend;
7461         const PetscInt  cs   = C->cmap->rstart;
7462         for (i = 0; i < mr; i++) {
7463           const PetscInt *jj = mm->j + ii[i];
7464           const PetscInt  gr = rmap[i];
7465           const PetscInt  nz = ii[i + 1] - ii[i];
7466           if (gr < rs || gr >= re) { /* this is an offproc row */
7467             for (j = ii[i]; j < ii[i + 1]; j++) {
7468               *coi++    = gr;
7469               *idxoff++ = j;
7470             }
7471             if (!cmapt[cp]) { /* already global */
7472               for (j = 0; j < nz; j++) *coj++ = jj[j];
7473             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7474               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7475             } else { /* offdiag */
7476               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7477             }
7478             ncoo_o += nz;
7479           } else { /* this is a local row */
7480             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7481           }
7482         }
7483       }
7484       mmdata->off[cp + 1] = idxoff;
7485       mmdata->own[cp + 1] = idxown;
7486     }
7487 
7488     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7489     PetscInt incoo_o;
7490     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7491     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7492     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7493     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7494     ncoo = ncoo_d + ncoo_oown + ncoo2;
7495     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7496     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7497     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7498     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7499     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7500     PetscCall(PetscFree2(coo_i, coo_j));
7501     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7502     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7503     coo_i = coo_i2;
7504     coo_j = coo_j2;
7505   } else { /* no offproc values insertion */
7506     ncoo = ncoo_d;
7507     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7508 
7509     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7510     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7511     PetscCall(PetscSFSetUp(mmdata->sf));
7512   }
7513   mmdata->hasoffproc = hasoffproc;
7514 
7515   /* gather (i,j) of nonzeros inserted locally */
7516   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7517     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7518     PetscInt       *coi  = coo_i + ncoo_d;
7519     PetscInt       *coj  = coo_j + ncoo_d;
7520     const PetscInt *jj   = mm->j;
7521     const PetscInt *ii   = mm->i;
7522     const PetscInt *cmap = cmapa[cp];
7523     const PetscInt *rmap = rmapa[cp];
7524     const PetscInt  mr   = mp[cp]->rmap->n;
7525     const PetscInt  rs   = C->rmap->rstart;
7526     const PetscInt  re   = C->rmap->rend;
7527     const PetscInt  cs   = C->cmap->rstart;
7528 
7529     if (mptmp[cp]) continue;
7530     if (rmapt[cp] == 1) { /* consecutive rows */
7531       /* fill coo_i */
7532       for (i = 0; i < mr; i++) {
7533         const PetscInt gr = i + rs;
7534         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7535       }
7536       /* fill coo_j */
7537       if (!cmapt[cp]) { /* type-0, already global */
7538         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7539       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7540         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7541       } else {                                            /* type-2, local to global for sparse columns */
7542         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7543       }
7544       ncoo_d += mm->nz;
7545     } else if (rmapt[cp] == 2) { /* sparse rows */
7546       for (i = 0; i < mr; i++) {
7547         const PetscInt *jj = mm->j + ii[i];
7548         const PetscInt  gr = rmap[i];
7549         const PetscInt  nz = ii[i + 1] - ii[i];
7550         if (gr >= rs && gr < re) { /* local rows */
7551           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7552           if (!cmapt[cp]) { /* type-0, already global */
7553             for (j = 0; j < nz; j++) *coj++ = jj[j];
7554           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7555             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7556           } else { /* type-2, local to global for sparse columns */
7557             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7558           }
7559           ncoo_d += nz;
7560         }
7561       }
7562     }
7563   }
7564   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7565   PetscCall(ISDestroy(&glob));
7566   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7567   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7568   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7569   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7570 
7571   /* set block sizes */
7572   A = product->A;
7573   P = product->B;
7574   switch (ptype) {
7575   case MATPRODUCT_PtAP:
7576     PetscCall(MatSetBlockSizes(C, P->cmap->bs, P->cmap->bs));
7577     break;
7578   case MATPRODUCT_RARt:
7579     PetscCall(MatSetBlockSizes(C, P->rmap->bs, P->rmap->bs));
7580     break;
7581   case MATPRODUCT_ABC:
7582     PetscCall(MatSetBlockSizesFromMats(C, A, product->C));
7583     break;
7584   case MATPRODUCT_AB:
7585     PetscCall(MatSetBlockSizesFromMats(C, A, P));
7586     break;
7587   case MATPRODUCT_AtB:
7588     PetscCall(MatSetBlockSizes(C, A->cmap->bs, P->cmap->bs));
7589     break;
7590   case MATPRODUCT_ABt:
7591     PetscCall(MatSetBlockSizes(C, A->rmap->bs, P->rmap->bs));
7592     break;
7593   default:
7594     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for ProductType %s", MatProductTypes[ptype]);
7595   }
7596 
7597   /* preallocate with COO data */
7598   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7599   PetscCall(PetscFree2(coo_i, coo_j));
7600   PetscFunctionReturn(PETSC_SUCCESS);
7601 }
7602 
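     /* How the symbolic and numeric phases fit together (a sketch of the intended flow, not an additional
        code path): the symbolic phase above computes the global COO pattern once with
        MatSetPreallocationCOO(); the numeric phase then only needs to fill mmdata->coo_v (scattering the
        offproc entries through mmdata->sf) and insert the values, roughly

          PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));    // symbolic, done once above
          PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES)); // numeric, done on every product

        MatSetValuesCOO() is the generic COO value-insertion API; that the numeric kernel reduces to exactly
        this call is an assumption based on the preallocation done here. */
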
7603 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7604 {
7605   Mat_Product *product = mat->product;
7606 #if defined(PETSC_HAVE_DEVICE)
7607   PetscBool match  = PETSC_FALSE;
7608   PetscBool usecpu = PETSC_FALSE;
7609 #else
7610   PetscBool match = PETSC_TRUE;
7611 #endif
7612 
7613   PetscFunctionBegin;
7614   MatCheckProduct(mat, 1);
7615 #if defined(PETSC_HAVE_DEVICE)
7616   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7617   if (match) { /* we can always fall back to the CPU if requested */
7618     switch (product->type) {
7619     case MATPRODUCT_AB:
7620       if (product->api_user) {
7621         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7622         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7623         PetscOptionsEnd();
7624       } else {
7625         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7626         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7627         PetscOptionsEnd();
7628       }
7629       break;
7630     case MATPRODUCT_AtB:
7631       if (product->api_user) {
7632         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7633         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7634         PetscOptionsEnd();
7635       } else {
7636         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7637         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7638         PetscOptionsEnd();
7639       }
7640       break;
7641     case MATPRODUCT_PtAP:
7642       if (product->api_user) {
7643         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7644         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7645         PetscOptionsEnd();
7646       } else {
7647         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7648         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7649         PetscOptionsEnd();
7650       }
7651       break;
7652     default:
7653       break;
7654     }
7655     match = (PetscBool)!usecpu;
7656   }
7657 #endif
7658   if (match) {
7659     switch (product->type) {
7660     case MATPRODUCT_AB:
7661     case MATPRODUCT_AtB:
7662     case MATPRODUCT_PtAP:
7663       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7664       break;
7665     default:
7666       break;
7667     }
7668   }
7669   /* fallback to MPIAIJ ops */
7670   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7671   PetscFunctionReturn(PETSC_SUCCESS);
7672 }
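
     /* Example of requesting the CPU fallback selected above (these are the option names registered in this
        function):

          -matptap_backend_cpu                 (MatPtAP() API interface)
          -mat_product_algorithm_backend_cpu   (MatProduct interface, for AB, AtB or PtAP)

        With either option the product falls back to the implementation chosen by
        MatProductSetFromOptions_MPIAIJ(). */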
7673 
7674 /*
7675    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7676 
7677    n - the number of block indices in cc[]
7678    cc - the block indices (must be large enough to contain the indices)
7679 */
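     /*
        For instance (a worked example of the collapse below): with bs = 2 and a row whose column indices are
        {0, 1, 4, 5, 8}, cc[] is filled with the block indices {0, 2, 4} and *n is set to 3
     */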
7680 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7681 {
7682   PetscInt        cnt = -1, nidx, j;
7683   const PetscInt *idx;
7684 
7685   PetscFunctionBegin;
7686   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7687   if (nidx) {
7688     cnt     = 0;
7689     cc[cnt] = idx[0] / bs;
7690     for (j = 1; j < nidx; j++) {
7691       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7692     }
7693   }
7694   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7695   *n = cnt + 1;
7696   PetscFunctionReturn(PETSC_SUCCESS);
7697 }
7698 
7699 /*
7700     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7701 
7702     ncollapsed - the number of block indices
7703     collapsed - the block indices (must be large enough to contain the indices)
7704 */
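     /*
        For instance (assuming PetscMergeIntArray() returns the sorted union of its two sorted inputs): with
        bs = 2, if row start collapses to block indices {0, 3} and row start + 1 collapses to {1, 3}, then
        *ncollapsed is 3 and *collapsed points to {0, 1, 3}
     */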
7705 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7706 {
7707   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7708 
7709   PetscFunctionBegin;
7710   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7711   for (i = start + 1; i < start + bs; i++) {
7712     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7713     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7714     cprevtmp = cprev;
7715     cprev    = merged;
7716     merged   = cprevtmp;
7717   }
7718   *ncollapsed = nprev;
7719   if (collapsed) *collapsed = cprev;
7720   PetscFunctionReturn(PETSC_SUCCESS);
7721 }
7722 
7723 /*
7724  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7725 
7726  Input Parameters:
7727  + Amat - matrix
7728  . symmetrize - make the result symmetric
7729  . scale - symmetrically scale with the diagonal
      . filter - when >= 0, entries of the resulting graph smaller than filter (in absolute value) are dropped
      - index_size, index - when index_size > 0, only the rows/columns of each block listed in index[] contribute to the block values
7730 
7731  Output Parameter:
7732  . a_Gmat - output scalar graph (values >= 0)
7733 
7734 */
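     /*
      A minimal usage sketch (the argument values are illustrative, not prescriptive): build a symmetrized,
      diagonally scaled scalar graph from a (possibly blocked) AIJ matrix A, using all rows/columns of each
      block and dropping graph entries below 1e-4:

        Mat G;
        PetscCall(MatCreateGraph_Simple_AIJ(A, PETSC_TRUE, PETSC_TRUE, 1.e-4, 0, NULL, &G));
        ...
        PetscCall(MatDestroy(&G));
     */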
7735 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7736 {
7737   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7738   MPI_Comm  comm;
7739   Mat       Gmat;
7740   PetscBool ismpiaij, isseqaij;
7741   Mat       a, b, c;
7742   MatType   jtype;
7743 
7744   PetscFunctionBegin;
7745   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7746   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7747   PetscCall(MatGetSize(Amat, &MM, &NN));
7748   PetscCall(MatGetBlockSize(Amat, &bs));
7749   nloc = (Iend - Istart) / bs;
7750 
7751   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7752   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7753   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7754 
7755   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
7756   /* One solution would be to provide a new API, MatAIJGetCollapsedAIJ, which each class could implement
7757      efficiently */
7758   if (bs > 1) {
7759     PetscCall(MatGetType(Amat, &jtype));
7760     PetscCall(MatCreate(comm, &Gmat));
7761     PetscCall(MatSetType(Gmat, jtype));
7762     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7763     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7764     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7765       PetscInt  *d_nnz, *o_nnz;
7766       MatScalar *aa, val, *AA;
7767       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7768 
7769       if (isseqaij) {
7770         a = Amat;
7771         b = NULL;
7772       } else {
7773         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7774         a             = d->A;
7775         b             = d->B;
7776       }
7777       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7778       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7779       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7780         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7781         const PetscInt *cols1, *cols2;
7782 
7783         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7784           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7785           nnz[brow / bs] = nc2 / bs;
7786           if (nc2 % bs) ok = 0;
7787           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7788           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7789             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7790             if (nc1 != nc2) ok = 0;
7791             else {
7792               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7793                 if (cols1[jj] != cols2[jj]) ok = 0;
7794                 if (cols1[jj] % bs != jj % bs) ok = 0;
7795               }
7796             }
7797             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7798           }
7799           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7800           if (!ok) {
7801             PetscCall(PetscFree2(d_nnz, o_nnz));
7802             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7803             goto old_bs;
7804           }
7805         }
7806       }
7807       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7808       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7809       PetscCall(PetscFree2(d_nnz, o_nnz));
7810       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7811       // diag
7812       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7813         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7814 
7815         ai = aseq->i;
7816         n  = ai[brow + 1] - ai[brow];
7817         aj = aseq->j + ai[brow];
7818         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7819           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7820           val        = 0;
7821           if (index_size == 0) {
7822             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7823               aa = aseq->a + ai[brow + ii] + k;
7824               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7825                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7826               }
7827             }
7828           } else {                                            // use (index,index) value if provided
7829             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7830               PetscInt ii = index[iii];
7831               aa          = aseq->a + ai[brow + ii] + k;
7832               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7833                 PetscInt jj = index[jjj];
7834                 val += PetscAbs(PetscRealPart(aa[jj]));
7835               }
7836             }
7837           }
7838           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7839           AA[k / bs] = val;
7840         }
7841         grow = Istart / bs + brow / bs;
7842         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7843       }
7844       // off-diag
7845       if (ismpiaij) {
7846         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7847         const PetscScalar *vals;
7848         const PetscInt    *cols, *garray = aij->garray;
7849 
7850         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
7851         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7852           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7853           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7854             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7855             AA[k / bs] = 0;
7856             AJ[cidx]   = garray[cols[k]] / bs;
7857           }
7858           nc = ncols / bs;
7859           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7860           if (index_size == 0) {
7861             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7862               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7863               for (PetscInt k = 0; k < ncols; k += bs) {
7864                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7865                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7866                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7867                 }
7868               }
7869               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7870             }
7871           } else {                                            // use (index,index) value if provided
7872             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7873               PetscInt ii = index[iii];
7874               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7875               for (PetscInt k = 0; k < ncols; k += bs) {
7876                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7877                   PetscInt jj = index[jjj];
7878                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7879                 }
7880               }
7881               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7882             }
7883           }
7884           grow = Istart / bs + brow / bs;
7885           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7886         }
7887       }
7888       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7889       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7890       PetscCall(PetscFree2(AA, AJ));
7891     } else {
7892       const PetscScalar *vals;
7893       const PetscInt    *idx;
7894       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7895     old_bs:
7896       /*
7897        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7898        */
7899       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7900       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7901       if (isseqaij) {
7902         PetscInt max_d_nnz;
7903 
7904         /*
7905          Determine exact preallocation count for (sequential) scalar matrix
7906          */
7907         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7908         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7909         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7910         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7911         PetscCall(PetscFree3(w0, w1, w2));
7912       } else if (ismpiaij) {
7913         Mat             Daij, Oaij;
7914         const PetscInt *garray;
7915         PetscInt        max_d_nnz;
7916 
7917         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7918         /*
7919          Determine exact preallocation count for diagonal block portion of scalar matrix
7920          */
7921         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7922         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7923         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7924         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7925         PetscCall(PetscFree3(w0, w1, w2));
7926         /*
7927          Overestimate (usually grossly) the preallocation count for the off-diagonal portion of the scalar matrix
7928          */
7929         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7930           o_nnz[jj] = 0;
7931           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7932             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7933             o_nnz[jj] += ncols;
7934             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7935           }
7936           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
7937         }
7938       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
7939       /* get scalar copy (norms) of matrix */
7940       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7941       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7942       PetscCall(PetscFree2(d_nnz, o_nnz));
7943       for (Ii = Istart; Ii < Iend; Ii++) {
7944         PetscInt dest_row = Ii / bs;
7945 
7946         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
7947         for (jj = 0; jj < ncols; jj++) {
7948           PetscInt    dest_col = idx[jj] / bs;
7949           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
7950 
7951           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
7952         }
7953         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
7954       }
7955       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7956       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7957     }
7958   } else {
7959     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
7960     else {
7961       Gmat = Amat;
7962       PetscCall(PetscObjectReference((PetscObject)Gmat));
7963     }
7964     if (isseqaij) {
7965       a = Gmat;
7966       b = NULL;
7967     } else {
7968       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
7969       a             = d->A;
7970       b             = d->B;
7971     }
7972     if (filter >= 0 || scale) {
7973       /* take absolute value of each entry */
7974       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7975         MatInfo      info;
7976         PetscScalar *avals;
7977 
7978         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
7979         PetscCall(MatSeqAIJGetArray(c, &avals));
7980         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
7981         PetscCall(MatSeqAIJRestoreArray(c, &avals));
7982       }
7983     }
7984   }
7985   if (symmetrize) {
7986     PetscBool isset, issym;
7987 
7988     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
7989     if (!isset || !issym) {
7990       Mat matTrans;
7991 
7992       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
7993       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
7994       PetscCall(MatDestroy(&matTrans));
7995     }
7996     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
7997   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
7998   if (scale) {
7999     /* symmetrically scale Gmat so that all diagonal values become 1 or -1 */
8000     Vec diag;
8001 
8002     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8003     PetscCall(MatGetDiagonal(Gmat, diag));
8004     PetscCall(VecReciprocal(diag));
8005     PetscCall(VecSqrtAbs(diag));
8006     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8007     PetscCall(VecDestroy(&diag));
8008   }
8009   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8010   if (filter >= 0) {
8011     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8012     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8013   }
8014   *a_Gmat = Gmat;
8015   PetscFunctionReturn(PETSC_SUCCESS);
8016 }
8017 
8018 PETSC_INTERN PetscErrorCode MatGetCurrentMemType_MPIAIJ(Mat A, PetscMemType *memtype)
8019 {
8020   Mat_MPIAIJ  *mpiaij = (Mat_MPIAIJ *)A->data;
8021   PetscMemType mD = PETSC_MEMTYPE_HOST, mO = PETSC_MEMTYPE_HOST;
8022 
8023   PetscFunctionBegin;
8024   if (mpiaij->A) PetscCall(MatGetCurrentMemType(mpiaij->A, &mD));
8025   if (mpiaij->B) PetscCall(MatGetCurrentMemType(mpiaij->B, &mO));
8026   *memtype = (mD == mO) ? mD : PETSC_MEMTYPE_HOST;
8027   PetscFunctionReturn(PETSC_SUCCESS);
8028 }
8029 
8030 /*
8031     Special version for direct calls from Fortran
8032 */
8033 
8034 /* Change these macros so they can be used in a void function */
8035 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8036 #undef PetscCall
8037 #define PetscCall(...) \
8038   do { \
8039     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8040     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8041       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8042       return; \
8043     } \
8044   } while (0)
8045 
8046 #undef SETERRQ
8047 #define SETERRQ(comm, ierr, ...) \
8048   do { \
8049     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8050     return; \
8051   } while (0)
8052 
8053 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8054   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8055 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8056   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8057 #else
8058 #endif
8059 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8060 {
8061   Mat         mat = *mmat;
8062   PetscInt    m = *mm, n = *mn;
8063   InsertMode  addv = *maddv;
8064   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8065   PetscScalar value;
8066 
8067   MatCheckPreallocated(mat, 1);
8068   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8069   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8070   {
8071     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8072     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8073     PetscBool roworiented = aij->roworiented;
8074 
8075     /* Some Variables required in the macro */
8076     Mat         A     = aij->A;
8077     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8078     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8079     MatScalar  *aa;
8080     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8081     Mat         B                 = aij->B;
8082     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8083     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8084     MatScalar  *ba;
8085     /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8086      * cannot use "#if defined" inside a macro. */
8087     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8088 
8089     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8090     PetscInt   nonew = a->nonew;
8091     MatScalar *ap1, *ap2;
8092 
8093     PetscFunctionBegin;
8094     PetscCall(MatSeqAIJGetArray(A, &aa));
8095     PetscCall(MatSeqAIJGetArray(B, &ba));
8096     for (i = 0; i < m; i++) {
8097       if (im[i] < 0) continue;
8098       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8099       if (im[i] >= rstart && im[i] < rend) {
8100         row      = im[i] - rstart;
8101         lastcol1 = -1;
8102         rp1      = aj + ai[row];
8103         ap1      = aa + ai[row];
8104         rmax1    = aimax[row];
8105         nrow1    = ailen[row];
8106         low1     = 0;
8107         high1    = nrow1;
8108         lastcol2 = -1;
8109         rp2      = bj + bi[row];
8110         ap2      = ba + bi[row];
8111         rmax2    = bimax[row];
8112         nrow2    = bilen[row];
8113         low2     = 0;
8114         high2    = nrow2;
8115 
8116         for (j = 0; j < n; j++) {
8117           if (roworiented) value = v[i * n + j];
8118           else value = v[i + j * m];
8119           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8120           if (in[j] >= cstart && in[j] < cend) {
8121             col = in[j] - cstart;
8122             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8123           } else if (in[j] < 0) continue;
8124           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8125             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8126           } else {
8127             if (mat->was_assembled) {
8128               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8129 #if defined(PETSC_USE_CTABLE)
8130               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8131               col--;
8132 #else
8133               col = aij->colmap[in[j]] - 1;
8134 #endif
8135               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8136                 PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
8137                 col = in[j];
8138                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8139                 B        = aij->B;
8140                 b        = (Mat_SeqAIJ *)B->data;
8141                 bimax    = b->imax;
8142                 bi       = b->i;
8143                 bilen    = b->ilen;
8144                 bj       = b->j;
8145                 rp2      = bj + bi[row];
8146                 ap2      = ba + bi[row];
8147                 rmax2    = bimax[row];
8148                 nrow2    = bilen[row];
8149                 low2     = 0;
8150                 high2    = nrow2;
8151                 bm       = aij->B->rmap->n;
8152                 ba       = b->a;
8153                 inserted = PETSC_FALSE;
8154               }
8155             } else col = in[j];
8156             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8157           }
8158         }
8159       } else if (!aij->donotstash) {
8160         if (roworiented) {
8161           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8162         } else {
8163           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8164         }
8165       }
8166     }
8167     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8168     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8169   }
8170   PetscFunctionReturnVoid();
8171 }
8172 
8173 /* Undefine these here since they were redefined from their original definitions above! No
8174  * other PETSc functions should be defined past this point, as it is impossible to recover the
8175  * original definitions */
8176 #undef PetscCall
8177 #undef SETERRQ
8178