xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 21604f62fddfd00a143407caac518db3de88a88a)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
10 #define TYPE AIJ
11 #define TYPE_AIJ
12 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
13 #undef TYPE
14 #undef TYPE_AIJ
15 
static PetscErrorCode MatReset_MPIAIJ(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;

  PetscFunctionBegin;
  /* Release all storage held by the parallel AIJ implementation: the stash of off-process
     entries, the diagonal (A) and off-diagonal (B) sequential blocks, the global-to-local
     column map, the compacted global column array, and the communication vector/scatter.
     The Mat header and mat->data themselves are left intact so the matrix can be rebuilt
     (see MatResetHash_MPIAIJ()) or finished off by MatDestroy_MPIAIJ(). */
  PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
  PetscCall(MatStashDestroy_Private(&mat->stash));
  PetscCall(VecDestroy(&aij->diag));
  PetscCall(MatDestroy(&aij->A));
  PetscCall(MatDestroy(&aij->B));
  /* colmap has two representations depending on the build configuration */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&aij->colmap));
#else
  PetscCall(PetscFree(aij->colmap));
#endif
  PetscCall(PetscFree(aij->garray));
  PetscCall(VecDestroy(&aij->lvec));
  PetscCall(VecScatterDestroy(&aij->Mvctx));
  PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
  PetscCall(PetscFree(aij->ld));
  PetscFunctionReturn(PETSC_SUCCESS);
}
38 
static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  /* Save the nonzero states of the component matrices because those are what are used to determine
    the nonzero state of mat */
  PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;

  PetscFunctionBegin;
  /* Tear down the current storage and put the matrix back into hash-based insertion mode */
  PetscCall(MatReset_MPIAIJ(mat));
  PetscCall(MatSetUp_MPI_Hash(mat));
  /* aij->A and aij->B now refer to fresh component matrices (presumably recreated by
     MatSetUp_MPI_Hash() — defined in mpihashmat.h); restore the saved states, incremented
     so the parent matrix reports a changed nonzero structure */
  aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
  PetscFunctionReturn(PETSC_SUCCESS);
}
52 
53 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
54 {
55   PetscFunctionBegin;
56   PetscCall(MatReset_MPIAIJ(mat));
57 
58   PetscCall(PetscFree(mat->data));
59 
60   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
61   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
62 
63   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
64   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
71   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
73   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
74 #if defined(PETSC_HAVE_CUDA)
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
76 #endif
77 #if defined(PETSC_HAVE_HIP)
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
79 #endif
80 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
81   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
82 #endif
83   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
84 #if defined(PETSC_HAVE_ELEMENTAL)
85   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
86 #endif
87 #if defined(PETSC_HAVE_SCALAPACK)
88   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
89 #endif
90 #if defined(PETSC_HAVE_HYPRE)
91   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
92   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
93 #endif
94   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
95   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
96   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
97   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
98   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
99   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
100 #if defined(PETSC_HAVE_MKL_SPARSE)
101   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
102 #endif
103   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
104   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
105   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
106   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
107   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
112 {
113   Mat B;
114 
115   PetscFunctionBegin;
116   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
117   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
118   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
119   PetscCall(MatDestroy(&B));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
124 {
125   Mat B;
126 
127   PetscFunctionBegin;
128   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
129   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
130   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
/*MC
   MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
   and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Key:
. -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`

   Level: beginner

   Developer Note:
   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, `MATAIJKOKKOS`, and also automatically switches over to use inodes when
   enough exist.

.seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
M*/
154 
155 /*MC
156    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
157 
158    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
159    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
160    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
161   for communicators controlling multiple processes.  It is recommended that you call both of
162   the above preallocation routines for simplicity.
163 
164    Options Database Key:
165 . -mat_type aijcrl - sets the matrix type to `MATMPIAIJCRL` during a call to `MatSetFromOptions()`
166 
167   Level: beginner
168 
.seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
170 M*/
171 
172 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
173 {
174   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
175 
176   PetscFunctionBegin;
177 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
178   A->boundtocpu = flg;
179 #endif
180   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
181   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
182 
183   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
184    * This maybe seems a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
185    * to differ from the parent matrix. */
186   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
187   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
192 {
193   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
194 
195   PetscFunctionBegin;
196   if (mat->A) {
197     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
198     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
199   }
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
/* Build an IS of the global indices of local rows that contain at least one nonzero value.
   On return *keptrows is NULL when every row (on every process) has a nonzero. */
static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
{
  Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
  Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
  Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
  const PetscInt  *ia, *ib;
  const MatScalar *aa, *bb, *aav, *bav;
  PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
  PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;

  PetscFunctionBegin;
  *keptrows = NULL;

  ia = a->i;
  ib = b->i;
  PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
  /* First pass: count zero rows, i.e. rows that are structurally empty or whose stored
     values (diagonal and off-diagonal block) are all exactly 0.0 */
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) {
      cnt++; /* structurally empty row */
      goto ok1;
    }
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) goto ok1; /* row has a nonzero: not counted */
    }
    bb = PetscSafePointerPlusOffset(bav, ib[i]);
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) goto ok1;
    }
    cnt++; /* every stored value is zero */
  ok1:;
  }
  /* If no process found a zero row leave *keptrows NULL, meaning all rows are kept */
  PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
  if (!n0rows) {
    PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
    PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* Second pass: record the global index of each row with at least one nonzero value
     (exactly M->rmap->n - cnt of them, as established by the first pass) */
  PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
  cnt = 0;
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) continue;
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
    bb = PetscSafePointerPlusOffset(bav, ib[i]);
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
  ok2:;
  }
  /* the IS takes ownership of rows */
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
  PetscFunctionReturn(PETSC_SUCCESS);
}
271 
272 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
275   PetscBool   cong;
276 
277   PetscFunctionBegin;
278   PetscCall(MatHasCongruentLayouts(Y, &cong));
279   if (Y->assembled && cong) {
280     PetscCall(MatDiagonalSet(aij->A, D, is));
281   } else {
282     PetscCall(MatDiagonalSet_Default(Y, D, is));
283   }
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
288 {
289   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
290   PetscInt    i, rstart, nrows, *rows;
291 
292   PetscFunctionBegin;
293   *zrows = NULL;
294   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
295   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
296   for (i = 0; i < nrows; i++) rows[i] += rstart;
297   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
/* Compute a per-column reduction (norm, sum, or mean of real/imaginary parts) over all
   rows of the parallel matrix; reductions[] must have length equal to the global column
   count and is identical on all processes on return. */
static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
{
  Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
  PetscInt           i, m, n, *garray = aij->garray;
  Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
  Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
  PetscReal         *work;
  const PetscScalar *dummy;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &m, &n));
  /* work accumulates the local contribution for every global column (zero-initialized) */
  PetscCall(PetscCalloc1(n, &work));
  /* Get/restore the value arrays without using them: presumably this forces any device
     copies to be synced to the host before a_aij->a / b_aij->a are read directly below
     — NOTE(review): confirm this is the intent */
  PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
  /* In each branch: the diagonal block's local column j maps to global column
     A->cmap->rstart + j, while the off-diagonal block's columns map through garray */
  if (type == NORM_2) {
    /* |a^2| == |a|^2, so this accumulates squared magnitudes */
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
  } else if (type == NORM_1) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
  } else if (type == NORM_INFINITY) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
  } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
  } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
  } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
  /* Combine the per-process partial results: max for the infinity norm, sum otherwise */
  if (type == NORM_INFINITY) {
    PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
  } else {
    PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscCall(PetscFree(work));
  /* Post-process: square root for the 2-norm, divide by the global row count for means */
  if (type == NORM_2) {
    for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
  } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < n; i++) reductions[i] /= m;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
346 
347 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
348 {
349   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
350   IS              sis, gis;
351   const PetscInt *isis, *igis;
352   PetscInt        n, *iis, nsis, ngis, rstart, i;
353 
354   PetscFunctionBegin;
355   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
356   PetscCall(MatFindNonzeroRows(a->B, &gis));
357   PetscCall(ISGetSize(gis, &ngis));
358   PetscCall(ISGetSize(sis, &nsis));
359   PetscCall(ISGetIndices(sis, &isis));
360   PetscCall(ISGetIndices(gis, &igis));
361 
362   PetscCall(PetscMalloc1(ngis + nsis, &iis));
363   PetscCall(PetscArraycpy(iis, igis, ngis));
364   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
365   n = ngis + nsis;
366   PetscCall(PetscSortRemoveDupsInt(&n, iis));
367   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
368   for (i = 0; i < n; i++) iis[i] += rstart;
369   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
370 
371   PetscCall(ISRestoreIndices(sis, &isis));
372   PetscCall(ISRestoreIndices(gis, &igis));
373   PetscCall(ISDestroy(&sis));
374   PetscCall(ISDestroy(&gis));
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379   Local utility routine that creates a mapping from the global column
380 number to the local number in the off-diagonal part of the local
381 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable at
382 a slightly higher hash table cost; without it it is not scalable (each processor
383 has an order N integer array but is fast to access.
384 */
385 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
386 {
387   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
388   PetscInt    n   = aij->B->cmap->n, i;
389 
390   PetscFunctionBegin;
391   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
392 #if defined(PETSC_USE_CTABLE)
393   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
394   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
395 #else
396   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
397   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
398 #endif
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
401 
/*
  Insert or add `value` at local (row, col) of the diagonal block of an MPIAIJ matrix.
  Performs a binary search narrowed to 5 entries then a linear scan of the (sorted) row;
  updates in place when the entry exists, otherwise inserts it (reallocating via
  MatSeqXAIJReallocateAIJ if needed) unless new nonzeros are disallowed (nonew == 1
  silently skips, nonew == -1 errors).  Relies on many locals of the calling scope
  (rp1, ap1, low1/high1, nrow1, lastcol1, aimax, ai, aj, ailen, a, A, am, nonew, ...);
  only usable inside MatSetValues-style routines.  orow/ocol are the original global
  indices, used only in the error message.
*/
#define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol1) low1 = 0; \
    else high1 = nrow1; \
    lastcol1 = col; \
    while (high1 - low1 > 5) { \
      t = (low1 + high1) / 2; \
      if (rp1[t] > col) high1 = t; \
      else low1 = t; \
    } \
    for (_i = low1; _i < high1; _i++) { \
      if (rp1[_i] > col) break; \
      if (rp1[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap1[_i] += value; \
          /* Not sure LogFlops will slow down the code or not */ \
          (void)PetscLogFlops(1.0); \
        } else ap1[_i] = value; \
        goto a_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries && row != col) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    if (nonew == 1) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
    N = nrow1++ - 1; \
    a->nz++; \
    high1++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
    rp1[_i] = col; \
    ap1[_i] = value; \
  a_noinsert:; \
    ailen[row] = nrow1; \
  } while (0)
446 
/*
  Off-diagonal-block counterpart of MatSetValues_SeqAIJ_A_Private(): insert or add
  `value` at (row, col) of the off-diagonal block B, using the scope's rp2/ap2/low2/
  high2/nrow2/lastcol2/bimax/bi/bj/bilen/b/B/bm variables.  Note this variant skips
  zero values whenever ignorezeroentries is set (there is no row != col test, since
  off-diagonal entries are never on the matrix diagonal).
*/
#define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol2) low2 = 0; \
    else high2 = nrow2; \
    lastcol2 = col; \
    while (high2 - low2 > 5) { \
      t = (low2 + high2) / 2; \
      if (rp2[t] > col) high2 = t; \
      else low2 = t; \
    } \
    for (_i = low2; _i < high2; _i++) { \
      if (rp2[_i] > col) break; \
      if (rp2[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap2[_i] += value; \
          (void)PetscLogFlops(1.0); \
        } else ap2[_i] = value; \
        goto b_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    if (nonew == 1) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
    N = nrow2++ - 1; \
    b->nz++; \
    high2++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
    rp2[_i] = col; \
    ap2[_i] = value; \
  b_noinsert:; \
    bilen[row] = nrow2; \
  } while (0)
490 
/* Overwrite the stored values of one (global) row with v[], whose entries are assumed to
   be ordered by global column: left-of-diagonal-block part, diagonal-block part, then
   right-of-diagonal-block part — TODO confirm against callers.  The sparsity pattern is
   unchanged; only values are copied. */
static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
{
  Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
  PetscInt     l, *garray                         = mat->garray, diag;
  PetscScalar *aa, *ba;

  PetscFunctionBegin;
  /* code only works for square matrices A */

  /* find size of row to the left of the diagonal part */
  PetscCall(MatGetOwnershipRange(A, &diag, NULL));
  /* convert the global row number to a local one */
  row = row - diag;
  /* l counts the leading off-diagonal entries whose global column is <= diag,
     i.e. those that come before the diagonal block in global column order */
  for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
    if (garray[b->j[b->i[row] + l]] > diag) break;
  }
  if (l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row], v, l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }

  /* diagonal part: the next a->i[row+1]-a->i[row] entries of v */
  if (a->i[row + 1] - a->i[row]) {
    PetscCall(MatSeqAIJGetArray(mat->A, &aa));
    PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
    PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
  }

  /* right of diagonal part: the remaining off-diagonal entries of the row */
  if (b->i[row + 1] - b->i[row] - l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
528 
/* Insert or add an m x n logically dense block of values into a parallel AIJ matrix.
   Locally owned rows are routed to the diagonal (A) or off-diagonal (B) sequential
   block via the MatSetValues_SeqAIJ_{A,B}_Private macros; rows owned by other
   processes are stashed for communication during assembly. */
PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
  PetscScalar value = 0.0;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
  PetscBool   roworiented = aij->roworiented;

  /* Some Variables required in the macro */
  Mat         A     = aij->A;
  Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
  PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
  PetscBool   ignorezeroentries = a->ignorezeroentries;
  Mat         B                 = aij->B;
  Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
  PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
  MatScalar  *aa, *ba;
  PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
  PetscInt    nonew;
  MatScalar  *ap1, *ap2;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArray(A, &aa));
  PetscCall(MatSeqAIJGetArray(B, &ba));
  for (i = 0; i < m; i++) {
    if (im[i] < 0) continue; /* negative row indices are silently ignored */
    PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
    if (im[i] >= rstart && im[i] < rend) {
      /* locally owned row: set up the per-row search state used by the macros */
      row      = im[i] - rstart;
      lastcol1 = -1;
      rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
      ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
      rmax1    = aimax[row];
      nrow1    = ailen[row];
      low1     = 0;
      high1    = nrow1;
      lastcol2 = -1;
      rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
      ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
      rmax2    = bimax[row];
      nrow2    = bilen[row];
      low2     = 0;
      high2    = nrow2;

      for (j = 0; j < n; j++) {
        /* v == NULL inserts zeros (value stays 0.0); otherwise pick the element per orientation */
        if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
        if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
        if (in[j] >= cstart && in[j] < cend) {
          /* column owned locally: goes into the diagonal block A */
          col   = in[j] - cstart;
          nonew = a->nonew;
          MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
        } else if (in[j] < 0) {
          continue; /* negative column indices are silently ignored */
        } else {
          PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          if (mat->was_assembled) {
            /* after assembly, B uses compacted local column numbering; translate via colmap */
            if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
            col--;
#else
            col = aij->colmap[in[j]] - 1;
#endif
            if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
              PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
              B     = aij->B;
              b     = (Mat_SeqAIJ *)B->data;
              bimax = b->imax;
              bi    = b->i;
              bilen = b->ilen;
              bj    = b->j;
              ba    = b->a;
              rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
              ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
              rmax2 = bimax[row];
              nrow2 = bilen[row];
              low2  = 0;
              high2 = nrow2;
              bm    = aij->B->rmap->n;
              ba    = b->a;
            } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
              /* new off-diagonal nonzero but B forbids new entries: skip (nonew == 1) or error */
              if (1 == ((Mat_SeqAIJ *)aij->B->data)->nonew) {
                PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
              } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
            }
          } else col = in[j]; /* before first assembly B uses global column numbering */
          nonew = b->nonew;
          MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
        }
      }
    } else {
      /* row owned by another process: stash the whole row for communication at assembly */
      PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
      if (!aij->donotstash) {
        mat->assembled = PETSC_FALSE;
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
  }
  PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, bb might have been free'd due to reallocation above. But we don't access them here */
  PetscCall(MatSeqAIJRestoreArray(B, &ba));
  PetscFunctionReturn(PETSC_SUCCESS);
}
637 
638 /*
639     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
640     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
641     No off-processor parts off the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
642 */
643 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
644 {
645   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
646   Mat         A      = aij->A; /* diagonal part of the matrix */
647   Mat         B      = aij->B; /* off-diagonal part of the matrix */
648   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
649   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
650   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
651   PetscInt   *ailen = a->ilen, *aj = a->j;
652   PetscInt   *bilen = b->ilen, *bj = b->j;
653   PetscInt    am          = aij->A->rmap->n, j;
654   PetscInt    diag_so_far = 0, dnz;
655   PetscInt    offd_so_far = 0, onz;
656 
657   PetscFunctionBegin;
658   /* Iterate over all rows of the matrix */
659   for (j = 0; j < am; j++) {
660     dnz = onz = 0;
661     /*  Iterate over all non-zero columns of the current row */
662     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
663       /* If column is in the diagonal */
664       if (mat_j[col] >= cstart && mat_j[col] < cend) {
665         aj[diag_so_far++] = mat_j[col] - cstart;
666         dnz++;
667       } else { /* off-diagonal entries */
668         bj[offd_so_far++] = mat_j[col];
669         onz++;
670       }
671     }
672     ailen[j] = dnz;
673     bilen[j] = onz;
674   }
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
678 /*
679     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
680     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
681     No off-processor parts off the matrix are allowed here, they are set at a later point by MatSetValues_MPIAIJ.
682     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
683     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
684 */
685 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
686 {
687   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
688   Mat          A    = aij->A; /* diagonal part of the matrix */
689   Mat          B    = aij->B; /* off-diagonal part of the matrix */
690   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
691   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
692   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
693   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
694   PetscInt    *ailen = a->ilen, *aj = a->j;
695   PetscInt    *bilen = b->ilen, *bj = b->j;
696   PetscInt     am          = aij->A->rmap->n, j;
697   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
698   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
699   PetscScalar *aa = a->a, *ba = b->a;
700 
701   PetscFunctionBegin;
702   /* Iterate over all rows of the matrix */
703   for (j = 0; j < am; j++) {
704     dnz_row = onz_row = 0;
705     rowstart_offd     = full_offd_i[j];
706     rowstart_diag     = full_diag_i[j];
707     /*  Iterate over all non-zero columns of the current row */
708     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
709       /* If column is in the diagonal */
710       if (mat_j[col] >= cstart && mat_j[col] < cend) {
711         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
712         aa[rowstart_diag + dnz_row] = mat_a[col];
713         dnz_row++;
714       } else { /* off-diagonal entries */
715         bj[rowstart_offd + onz_row] = mat_j[col];
716         ba[rowstart_offd + onz_row] = mat_a[col];
717         onz_row++;
718       }
719     }
720     ailen[j] = dnz_row;
721     bilen[j] = onz_row;
722   }
723   PetscFunctionReturn(PETSC_SUCCESS);
724 }
725 
/*
  MatGetValues_MPIAIJ - Retrieves a logically dense block of entries, v[i*n+j] = mat(idxm[i], idxn[j]).

  Only locally owned rows may be requested (off-process rows raise PETSC_ERR_SUP); any
  column may be requested.  Negative row or column indices are skipped, leaving the
  corresponding entries of v unchanged.  Requested entries not stored on this process
  are returned as 0.0.
*/
static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;

  PetscFunctionBegin;
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue; /* negative row */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
    PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
    row = idxm[i] - rstart;
    for (j = 0; j < n; j++) {
      if (idxn[j] < 0) continue; /* negative column */
      PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
      if (idxn[j] >= cstart && idxn[j] < cend) {
        /* column lies in the diagonal block; shift to its local numbering */
        col = idxn[j] - cstart;
        PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
      } else {
        /* column lies in the off-diagonal block: translate the global column to B's
           local column via the (lazily created) colmap, which stores local+1 so that
           a lookup result of 0 means "not present" */
        if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
        col--;
#else
        col = aij->colmap[idxn[j]] - 1;
#endif
        if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0; /* entry not stored on this process */
        else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
759 
760 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
761 {
762   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
763   PetscInt    nstash, reallocs;
764 
765   PetscFunctionBegin;
766   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
767 
768   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
769   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
770   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
771   PetscFunctionReturn(PETSC_SUCCESS);
772 }
773 
/*
  MatAssemblyEnd_MPIAIJ - Completes assembly: receives stashed off-process entries,
  assembles the diagonal (A) and off-diagonal (B) sequential blocks, handles global
  disassembly/reassembly of B, and updates the collective nonzero state.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
  PetscMPIInt  n;
  PetscInt     i, j, rstart, ncols, flg;
  PetscInt    *row, *col;
  PetscBool    other_disassembled;
  PetscScalar *val;

  /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */

  PetscFunctionBegin;
  if (!aij->donotstash && !mat->nooffprocentries) {
    /* drain every message of stashed entries destined for this rank */
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break;

      for (i = 0; i < n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j = i, rstart = row[j]; j < n; j++) {
          if (row[j] != rstart) break;
        }
        if (j < n) ncols = j - i;
        else ncols = n - i;
        /* Now assemble all these values with a single function call */
        PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
        i = j;
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
  /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
  if (mat->boundtocpu) {
    PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
    PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
  }
#endif
  PetscCall(MatAssemblyBegin(aij->A, mode));
  PetscCall(MatAssemblyEnd(aij->A, mode));

  /* determine if any processor has disassembled, if so we must
     also disassemble ourself, in order that we may reassemble. */
  /*
     if nonzero structure of submatrix B cannot change then we know that
     no processor disassembled thus we can skip this stuff
  */
  if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
    PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &other_disassembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (mat->was_assembled && !other_disassembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
      PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
    }
  }
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
  PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscCall(MatAssemblyBegin(aij->B, mode));
  PetscCall(MatAssemblyEnd(aij->B, mode));

  /* cached row data (from MatGetRow) and the cached diagonal are now stale */
  PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));

  aij->rowvalues = NULL;

  PetscCall(VecDestroy(&aij->diag));

  /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
  if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
    PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
  }
#if defined(PETSC_HAVE_DEVICE)
  mat->offloadmask = PETSC_OFFLOAD_BOTH;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
852 
853 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
854 {
855   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
856 
857   PetscFunctionBegin;
858   PetscCall(MatZeroEntries(l->A));
859   PetscCall(MatZeroEntries(l->B));
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
/*
  MatZeroRows_MPIAIJ - Zeros the (globally numbered) rows in rows[], optionally placing
  diag on the diagonal of each zeroed row, and optionally fixing the right-hand side b
  (bb[row] = diag*xx[row]) so that x remains a solution in those rows.

  Rows owned by other processes are communicated by MatZeroRowsMapLocal_Private();
  lrows[] returned from it holds only the locally owned rows, in local numbering.
*/
static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
  PetscInt   *lrows;
  PetscInt    r, len;
  PetscBool   cong;

  PetscFunctionBegin;
  /* get locally owned rows */
  PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
  PetscCall(MatHasCongruentLayouts(A, &cong));
  /* fix right-hand side if needed */
  if (x && b) {
    const PetscScalar *xx;
    PetscScalar       *bb;

    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    PetscCall(VecGetArrayRead(x, &xx));
    PetscCall(VecGetArray(b, &bb));
    for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
    PetscCall(VecRestoreArrayRead(x, &xx));
    PetscCall(VecRestoreArray(b, &bb));
  }

  if (diag != 0.0 && cong) {
    /* congruent layouts: the diagonal entry of each zeroed row lives in the diagonal block,
       so the sequential MatZeroRows can insert diag directly */
    PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  } else if (diag != 0.0) { /* non-square or non congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
    Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
    Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
    PetscInt    nnwA, nnwB; /* saved 'nonew' flags, restored after the diagonal insertions */
    PetscBool   nnzA, nnzB; /* 'keepnonzeropattern' flags of the two blocks */

    nnwA = aijA->nonew;
    nnwB = aijB->nonew;
    nnzA = aijA->keepnonzeropattern;
    nnzB = aijB->keepnonzeropattern;
    if (!nnzA) {
      PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
      aijA->nonew = 0;
    }
    if (!nnzB) {
      PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
      aijB->nonew = 0;
    }
    /* Must zero here before the next loop */
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
    /* insert the diagonal entries one by one via the generic path, which routes them
       to the correct block and may allocate new nonzeros */
    for (r = 0; r < len; ++r) {
      const PetscInt row = lrows[r] + A->rmap->rstart;
      if (row >= A->cmap->N) continue; /* rectangular matrix: no diagonal entry past the last column */
      PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
    }
    aijA->nonew = nnwA;
    aijB->nonew = nnwB;
  } else {
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  }
  PetscCall(PetscFree(lrows));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

  /* only change matrix nonzero state if pattern was allowed to be changed */
  if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
    PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
933 
/*
  MatZeroRowsColumns_MPIAIJ - Zeros both the rows and the columns listed (globally) in
  rows[], optionally placing diag on the diagonal and adjusting b so x stays a solution.

  Strategy: an SF maps the caller-supplied global rows to their owners; the diagonal
  block is handled by the sequential MatZeroRowsColumns; for the off-diagonal block a
  ghosted mask vector marks zeroed columns so their entries can be removed (and their
  contribution subtracted from b when x and b are given).
*/
static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
  PetscInt           n = A->rmap->n;
  PetscInt           i, j, r, m, len = 0;
  PetscInt          *lrows, *owners = A->rmap->range;
  PetscMPIInt        p = 0;
  PetscSFNode       *rrows;
  PetscSF            sf;
  const PetscScalar *xx;
  PetscScalar       *bb, *mask, *aij_a;
  Vec                xmask, lmask;
  Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
  const PetscInt    *aj, *ii, *ridx;
  PetscScalar       *aa;

  PetscFunctionBegin;
  /* Create SF where leaves are input rows and roots are owned rows */
  PetscCall(PetscMalloc1(n, &lrows));
  for (r = 0; r < n; ++r) lrows[r] = -1;
  PetscCall(PetscMalloc1(N, &rrows));
  for (r = 0; r < N; ++r) {
    const PetscInt idx = rows[r];
    PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
    if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
      PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
    }
    rrows[r].rank  = p;
    rrows[r].index = rows[r] - owners[p];
  }
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
  /* Collect flags for rows to be zeroed */
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFDestroy(&sf));
  /* Compress and put in row numbers */
  for (r = 0; r < n; ++r)
    if (lrows[r] >= 0) lrows[len++] = r;
  /* zero diagonal part of matrix */
  PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
  /* handle off-diagonal part of matrix */
  /* build a ghosted mask: 1 in every ghost slot whose global column was zeroed */
  PetscCall(MatCreateVecs(A, &xmask, NULL));
  PetscCall(VecDuplicate(l->lvec, &lmask));
  PetscCall(VecGetArray(xmask, &bb));
  for (i = 0; i < len; i++) bb[lrows[i]] = 1;
  PetscCall(VecRestoreArray(xmask, &bb));
  PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecDestroy(&xmask));
  if (x && b) { /* this code is buggy when the row and column layout don't match */
    PetscBool cong;

    PetscCall(MatHasCongruentLayouts(A, &cong));
    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    /* bring the ghost values of x over so bb can be corrected for removed columns */
    PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecGetArrayRead(l->lvec, &xx));
    PetscCall(VecGetArray(b, &bb));
  }
  PetscCall(VecGetArray(lmask, &mask));
  /* remove zeroed rows of off-diagonal matrix */
  PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
  ii = aij->i;
  for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
  /* loop over all elements of off process part of matrix zeroing removed columns*/
  if (aij->compressedrow.use) {
    m    = aij->compressedrow.nrows;
    ii   = aij->compressedrow.i;
    ridx = aij->compressedrow.rindex;
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i];
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];

      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) {
          if (b) bb[*ridx] -= *aa * xx[*aj]; /* move the known value to the right-hand side */
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
      ridx++;
    }
  } else { /* do not use compressed row format */
    m = l->B->rmap->n;
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i];
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];
      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) {
          if (b) bb[i] -= *aa * xx[*aj];
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
    }
  }
  if (x && b) {
    PetscCall(VecRestoreArray(b, &bb));
    PetscCall(VecRestoreArrayRead(l->lvec, &xx));
  }
  PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
  PetscCall(VecRestoreArray(lmask, &mask));
  PetscCall(VecDestroy(&lmask));
  PetscCall(PetscFree(lrows));

  /* only change matrix nonzero state if pattern was allowed to be changed */
  if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
    PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1051 
1052 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1053 {
1054   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1055   PetscInt    nt;
1056   VecScatter  Mvctx = a->Mvctx;
1057 
1058   PetscFunctionBegin;
1059   PetscCall(VecGetLocalSize(xx, &nt));
1060   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1061   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1062   PetscUseTypeMethod(a->A, mult, xx, yy);
1063   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1064   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
1067 
1068 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1069 {
1070   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1071 
1072   PetscFunctionBegin;
1073   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1078 {
1079   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1080   VecScatter  Mvctx = a->Mvctx;
1081 
1082   PetscFunctionBegin;
1083   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1084   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1085   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1086   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1091 {
1092   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1093 
1094   PetscFunctionBegin;
1095   /* do nondiagonal part */
1096   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1097   /* do local part */
1098   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1099   /* add partial results together */
1100   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1101   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1102   PetscFunctionReturn(PETSC_SUCCESS);
1103 }
1104 
/*
  MatIsTranspose_MPIAIJ - Tests whether Bmat equals Amat^T to within tol.

  First a cheap collective test on the diagonal blocks; only if that passes (and the
  communicator has more than one rank) are the off-diagonal parts compared via
  MatCreateSubMatrices(), which is comparatively expensive.

  NOTE(review): notme is allocated with N - last + first entries but the second fill
  loop runs rows up to M; this assumes M == N (square matrices) — confirm callers.
*/
static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
{
  MPI_Comm    comm;
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
  Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
  IS          Me, Notme;
  PetscInt    M, N, first, last, *notme, i;
  PetscBool   lf;
  PetscMPIInt size;

  PetscFunctionBegin;
  /* Easy test: symmetric diagonal block */
  PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
  /* all ranks must agree before the expensive test is attempted */
  PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
  if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);

  /* Hard test: off-diagonal block. This takes a MatCreateSubMatrix. */
  PetscCall(MatGetSize(Amat, &M, &N));
  PetscCall(MatGetOwnershipRange(Amat, &first, &last));
  /* notme = all global indices outside this rank's ownership range [first,last) */
  PetscCall(PetscMalloc1(N - last + first, &notme));
  for (i = 0; i < first; i++) notme[i] = i;
  for (i = last; i < M; i++) notme[i - last + first] = i;
  PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
  PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
  PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
  Aoff = Aoffs[0];
  PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
  Boff = Boffs[0];
  PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
  PetscCall(MatDestroyMatrices(1, &Aoffs));
  PetscCall(MatDestroyMatrices(1, &Boffs));
  PetscCall(ISDestroy(&Me));
  PetscCall(ISDestroy(&Notme));
  PetscCall(PetscFree(notme));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1144 
1145 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1146 {
1147   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1148 
1149   PetscFunctionBegin;
1150   /* do nondiagonal part */
1151   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1152   /* do local part */
1153   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1154   /* add partial results together */
1155   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1156   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1157   PetscFunctionReturn(PETSC_SUCCESS);
1158 }
1159 
1160 /*
1161   This only works correctly for square matrices where the subblock A->A is the
1162    diagonal block
1163 */
1164 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1165 {
1166   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1167 
1168   PetscFunctionBegin;
1169   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1170   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1171   PetscCall(MatGetDiagonal(a->A, v));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1176 {
1177   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1178 
1179   PetscFunctionBegin;
1180   PetscCall(MatScale(a->A, aa));
1181   PetscCall(MatScale(a->B, aa));
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1186 {
1187   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1188   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1189   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1190   const PetscInt    *garray = aij->garray;
1191   const PetscScalar *aa, *ba;
1192   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1193   PetscInt64         nz, hnz;
1194   PetscInt          *rowlens;
1195   PetscInt          *colidxs;
1196   PetscScalar       *matvals;
1197   PetscMPIInt        rank;
1198 
1199   PetscFunctionBegin;
1200   PetscCall(PetscViewerSetUp(viewer));
1201 
1202   M  = mat->rmap->N;
1203   N  = mat->cmap->N;
1204   m  = mat->rmap->n;
1205   rs = mat->rmap->rstart;
1206   cs = mat->cmap->rstart;
1207   nz = A->nz + B->nz;
1208 
1209   /* write matrix header */
1210   header[0] = MAT_FILE_CLASSID;
1211   header[1] = M;
1212   header[2] = N;
1213   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1214   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1215   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1216   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1217 
1218   /* fill in and store row lengths  */
1219   PetscCall(PetscMalloc1(m, &rowlens));
1220   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1221   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1222   PetscCall(PetscFree(rowlens));
1223 
1224   /* fill in and store column indices */
1225   PetscCall(PetscMalloc1(nz, &colidxs));
1226   for (cnt = 0, i = 0; i < m; i++) {
1227     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1228       if (garray[B->j[jb]] > cs) break;
1229       colidxs[cnt++] = garray[B->j[jb]];
1230     }
1231     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1232     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1233   }
1234   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1235   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1236   PetscCall(PetscFree(colidxs));
1237 
1238   /* fill in and store nonzero values */
1239   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1240   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1241   PetscCall(PetscMalloc1(nz, &matvals));
1242   for (cnt = 0, i = 0; i < m; i++) {
1243     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1244       if (garray[B->j[jb]] > cs) break;
1245       matvals[cnt++] = ba[jb];
1246     }
1247     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1248     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1249   }
1250   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1251   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1252   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1253   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1254   PetscCall(PetscFree(matvals));
1255 
1256   /* write block size option to the viewer's .info file */
1257   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 
1261 #include <petscdraw.h>
/*
  MatView_MPIAIJ_ASCIIorDraworSocket - Viewer back end for ASCII, binary, draw, and
  socket viewers.  Special ASCII formats (load balance, info, info-detail) are handled
  per-format; binary delegates to MatView_MPIAIJ_Binary(); all remaining cases gather
  the entire matrix onto rank 0 and view it there sequentially.
*/
static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
{
  Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
  PetscMPIInt       rank = aij->rank, size = aij->size;
  PetscBool         isdraw, iascii, isbinary;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  if (iascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_LOAD_BALANCE) {
      /* report min/avg/max local nonzero counts across all ranks */
      PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
      PetscCall(PetscMalloc1(size, &nz));
      PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
      for (i = 0; i < size; i++) {
        nmax = PetscMax(nmax, nz[i]);
        nmin = PetscMin(nmin, nz[i]);
        navg += nz[i];
      }
      PetscCall(PetscFree(nz));
      navg = navg / size;
      PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      /* per-rank synchronized report of local sizes, nonzeros, and inode usage */
      MatInfo   info;
      PetscInt *inodes = NULL;

      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
      PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
      PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      if (!inodes) {
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
                                                     info.memory));
      } else {
        PetscCall(
          PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
      }
      PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
      PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
      PetscCall(VecScatterView(aij->Mvctx, viewer));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      PetscInt inodecount, inodelimit, *inodes;
      PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
      if (inodes) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
      } else {
        PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
      }
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  } else if (isbinary) {
    if (size == 1) {
      PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
      PetscCall(MatView(aij->A, viewer));
    } else {
      PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (iascii && size == 1) {
    /* NOTE(review): this branch looks unreachable — the `if (iascii)` branch above
       already captures all ASCII viewers; confirm before relying on it */
    PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
    PetscCall(MatView(aij->A, viewer));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (isdraw) {
    PetscDraw draw;
    PetscBool isnull;
    PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
    PetscCall(PetscDrawIsNull(draw, &isnull));
    if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
  }

  { /* assemble the entire matrix onto first processor */
    Mat A = NULL, Av;
    IS  isrow, iscol;

    /* rank 0 requests all rows/columns; every other rank requests none */
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
    PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
    PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
    /*  The commented code uses MatCreateSubMatrices instead */
    /*
    Mat *AA, A = NULL, Av;
    IS  isrow,iscol;

    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
    PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
    if (rank == 0) {
       PetscCall(PetscObjectReference((PetscObject)AA[0]));
       A    = AA[0];
       Av   = AA[0];
    }
    PetscCall(MatDestroySubMatrices(1,&AA));
    */
    PetscCall(ISDestroy(&iscol));
    PetscCall(ISDestroy(&isrow));
    /*
       Everyone has to call to draw the matrix since the graphics waits are
       synchronized across all processors that share the PetscDraw object
    */
    PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    if (rank == 0) {
      if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
      PetscCall(MatView_SeqAIJ(Av, sviewer));
    }
    PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    PetscCall(MatDestroy(&A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1386 
1387 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1388 {
1389   PetscBool iascii, isdraw, issocket, isbinary;
1390 
1391   PetscFunctionBegin;
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1395   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1396   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1397   PetscFunctionReturn(PETSC_SUCCESS);
1398 }
1399 
1400 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1401 {
1402   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1403   Vec         bb1 = NULL;
1404   PetscBool   hasop;
1405 
1406   PetscFunctionBegin;
1407   if (flag == SOR_APPLY_UPPER) {
1408     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1409     PetscFunctionReturn(PETSC_SUCCESS);
1410   }
1411 
1412   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1413 
1414   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1415     if (flag & SOR_ZERO_INITIAL_GUESS) {
1416       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1417       its--;
1418     }
1419 
1420     while (its--) {
1421       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1422       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1423 
1424       /* update rhs: bb1 = bb - B*x */
1425       PetscCall(VecScale(mat->lvec, -1.0));
1426       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1427 
1428       /* local sweep */
1429       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1430     }
1431   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1432     if (flag & SOR_ZERO_INITIAL_GUESS) {
1433       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1434       its--;
1435     }
1436     while (its--) {
1437       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1438       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1439 
1440       /* update rhs: bb1 = bb - B*x */
1441       PetscCall(VecScale(mat->lvec, -1.0));
1442       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1443 
1444       /* local sweep */
1445       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1446     }
1447   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1448     if (flag & SOR_ZERO_INITIAL_GUESS) {
1449       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1450       its--;
1451     }
1452     while (its--) {
1453       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1454       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1455 
1456       /* update rhs: bb1 = bb - B*x */
1457       PetscCall(VecScale(mat->lvec, -1.0));
1458       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1459 
1460       /* local sweep */
1461       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1462     }
1463   } else if (flag & SOR_EISENSTAT) {
1464     Vec xx1;
1465 
1466     PetscCall(VecDuplicate(bb, &xx1));
1467     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1468 
1469     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1470     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1471     if (!mat->diag) {
1472       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1473       PetscCall(MatGetDiagonal(matin, mat->diag));
1474     }
1475     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1476     if (hasop) {
1477       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1478     } else {
1479       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1480     }
1481     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1482 
1483     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1484 
1485     /* local sweep */
1486     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1487     PetscCall(VecAXPY(xx, 1.0, xx1));
1488     PetscCall(VecDestroy(&xx1));
1489   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1490 
1491   PetscCall(VecDestroy(&bb1));
1492 
1493   matin->factorerrortype = mat->A->factorerrortype;
1494   PetscFunctionReturn(PETSC_SUCCESS);
1495 }
1496 
/*
  Permute rows and columns of an MPIAIJ matrix A according to rowp/colp, producing a
  new matrix in B.  Strategy:
    1. invert the row and column permutations with PetscSF reductions so every rank
       learns the global destination index of each row/column it owns (rdest/cdest),
    2. translate the compressed off-diagonal column map (garray) the same way (gcdest),
    3. count destination diagonal/off-diagonal nonzeros per row and broadcast the
       counts to the destination rows for exact preallocation (tdnnz/tonnz),
    4. insert the permuted entries with MatSetValues() and assemble.
*/
static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
{
  Mat             aA, aB, Aperm;
  const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
  PetscScalar    *aa, *ba;
  PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
  PetscSF         rowsf, sf;
  IS              parcolp = NULL;
  PetscBool       done;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(A, &m, &n));
  PetscCall(ISGetIndices(rowp, &rwant));
  PetscCall(ISGetIndices(colp, &cwant));
  /* work is sized to serve both the row (m) and column (n) passes below */
  PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));

  /* Invert row permutation to find out where my rows should go */
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
  PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
  PetscCall(PetscSFSetFromOptions(rowsf));
  for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
  PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));

  /* Invert column permutation to find out where my columns should go */
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
  PetscCall(PetscSFSetFromOptions(sf));
  for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));

  PetscCall(ISRestoreIndices(rowp, &rwant));
  PetscCall(ISRestoreIndices(colp, &cwant));
  PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));

  /* Find out where my gcols should go */
  PetscCall(MatGetSize(aB, NULL, &ng));
  PetscCall(PetscMalloc1(ng, &gcdest));
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));

  /* Count diagonal (dnnz) and off-diagonal (onnz) nonzeros that each source row will
     contribute at its destination, by comparing row/column owner ranks */
  PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
  PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
  PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
  for (i = 0; i < m; i++) {
    PetscInt    row = rdest[i];
    PetscMPIInt rowner;
    PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
    for (j = ai[i]; j < ai[i + 1]; j++) {
      PetscInt    col = cdest[aj[j]];
      PetscMPIInt cowner;
      PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
      if (rowner == cowner) dnnz[i]++;
      else onnz[i]++;
    }
    for (j = bi[i]; j < bi[i + 1]; j++) {
      PetscInt    col = gcdest[bj[j]];
      PetscMPIInt cowner;
      PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
      if (rowner == cowner) dnnz[i]++;
      else onnz[i]++;
    }
  }
  /* Ship the per-row counts to the ranks that own the destination rows */
  PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&rowsf));

  PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
  PetscCall(MatSeqAIJGetArray(aA, &aa));
  PetscCall(MatSeqAIJGetArray(aB, &ba));
  for (i = 0; i < m; i++) {
    PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
    PetscInt  j0, rowlen;
    rowlen = ai[i + 1] - ai[i];
    for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than number of rows m, so sum in batches */
      for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
      PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
    }
    rowlen = bi[i + 1] - bi[i];
    for (j0 = j = 0; j < rowlen; j0 = j) {
      for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
      PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
    }
  }
  PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
  PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
  PetscCall(MatSeqAIJRestoreArray(aA, &aa));
  PetscCall(MatSeqAIJRestoreArray(aB, &ba));
  PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
  PetscCall(PetscFree3(work, rdest, cdest));
  PetscCall(PetscFree(gcdest));
  /* NOTE(review): parcolp is initialized to NULL and never assigned in this function,
     so the conditional destroy below is currently dead code — confirm before removing */
  if (parcolp) PetscCall(ISDestroy(&colp));
  *B = Aperm;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1602 
1603 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1604 {
1605   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1606 
1607   PetscFunctionBegin;
1608   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1609   if (ghosts) *ghosts = aij->garray;
1610   PetscFunctionReturn(PETSC_SUCCESS);
1611 }
1612 
1613 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1614 {
1615   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1616   Mat            A = mat->A, B = mat->B;
1617   PetscLogDouble isend[5], irecv[5];
1618 
1619   PetscFunctionBegin;
1620   info->block_size = 1.0;
1621   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1622 
1623   isend[0] = info->nz_used;
1624   isend[1] = info->nz_allocated;
1625   isend[2] = info->nz_unneeded;
1626   isend[3] = info->memory;
1627   isend[4] = info->mallocs;
1628 
1629   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1630 
1631   isend[0] += info->nz_used;
1632   isend[1] += info->nz_allocated;
1633   isend[2] += info->nz_unneeded;
1634   isend[3] += info->memory;
1635   isend[4] += info->mallocs;
1636   if (flag == MAT_LOCAL) {
1637     info->nz_used      = isend[0];
1638     info->nz_allocated = isend[1];
1639     info->nz_unneeded  = isend[2];
1640     info->memory       = isend[3];
1641     info->mallocs      = isend[4];
1642   } else if (flag == MAT_GLOBAL_MAX) {
1643     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1644 
1645     info->nz_used      = irecv[0];
1646     info->nz_allocated = irecv[1];
1647     info->nz_unneeded  = irecv[2];
1648     info->memory       = irecv[3];
1649     info->mallocs      = irecv[4];
1650   } else if (flag == MAT_GLOBAL_SUM) {
1651     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1652 
1653     info->nz_used      = irecv[0];
1654     info->nz_allocated = irecv[1];
1655     info->nz_unneeded  = irecv[2];
1656     info->memory       = irecv[3];
1657     info->mallocs      = irecv[4];
1658   }
1659   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1660   info->fill_ratio_needed = 0;
1661   info->factor_mallocs    = 0;
1662   PetscFunctionReturn(PETSC_SUCCESS);
1663 }
1664 
1665 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1666 {
1667   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1668 
1669   PetscFunctionBegin;
1670   switch (op) {
1671   case MAT_NEW_NONZERO_LOCATIONS:
1672   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1673   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1674   case MAT_KEEP_NONZERO_PATTERN:
1675   case MAT_NEW_NONZERO_LOCATION_ERR:
1676   case MAT_USE_INODES:
1677   case MAT_IGNORE_ZERO_ENTRIES:
1678   case MAT_FORM_EXPLICIT_TRANSPOSE:
1679     MatCheckPreallocated(A, 1);
1680     PetscCall(MatSetOption(a->A, op, flg));
1681     PetscCall(MatSetOption(a->B, op, flg));
1682     break;
1683   case MAT_ROW_ORIENTED:
1684     MatCheckPreallocated(A, 1);
1685     a->roworiented = flg;
1686 
1687     PetscCall(MatSetOption(a->A, op, flg));
1688     PetscCall(MatSetOption(a->B, op, flg));
1689     break;
1690   case MAT_FORCE_DIAGONAL_ENTRIES:
1691   case MAT_SORTED_FULL:
1692     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
1693     break;
1694   case MAT_IGNORE_OFF_PROC_ENTRIES:
1695     a->donotstash = flg;
1696     break;
1697   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1698   case MAT_SPD:
1699   case MAT_SYMMETRIC:
1700   case MAT_STRUCTURALLY_SYMMETRIC:
1701   case MAT_HERMITIAN:
1702   case MAT_SYMMETRY_ETERNAL:
1703   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1704   case MAT_SPD_ETERNAL:
1705     /* if the diagonal matrix is square it inherits some of the properties above */
1706     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
1707     break;
1708   case MAT_SUBMAT_SINGLEIS:
1709     A->submat_singleis = flg;
1710     break;
1711   case MAT_STRUCTURE_ONLY:
1712     /* The option is handled directly by MatSetOption() */
1713     break;
1714   default:
1715     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1716   }
1717   PetscFunctionReturn(PETSC_SUCCESS);
1718 }
1719 
/*
  MatGetRow() for MPIAIJ: return one locally owned row with global column indices, in
  increasing column order, by merging the row of the diagonal block A with the row of
  the off-diagonal block B (whose local columns are translated through mat->garray).
  Results are returned in per-matrix workspace (rowvalues/rowindices); only one row may
  be outstanding at a time, enforced with the getrowactive flag and released by
  MatRestoreRow_MPIAIJ().
*/
PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
{
  Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
  PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
  PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
  PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
  PetscInt    *cmap, *idx_p;

  PetscFunctionBegin;
  PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
    PetscInt    max = 1, tmp;
    for (i = 0; i < matin->rmap->n; i++) {
      tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
      if (max < tmp) max = tmp;
    }
    PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
  }

  PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
  lrow = row - rstart;

  /* Pass NULL to the block getrow for whichever outputs the caller did not request */
  pvA = &vworkA;
  pcA = &cworkA;
  pvB = &vworkB;
  pcB = &cworkB;
  if (!v) {
    pvA = NULL;
    pvB = NULL;
  }
  if (!idx) {
    pcA = NULL;
    if (!v) pcB = NULL;
  }
  PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
  PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
  nztot = nzA + nzB;

  cmap = mat->garray;
  if (v || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      PetscInt imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        /* B entries whose global column is below cstart precede all A (diagonal block) entries */
        for (i = 0; i < nzB; i++) {
          if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
          else break;
        }
        imark = i; /* number of B entries left of the diagonal block */
        for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
        for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
        } else {
          /* imark was not computed above (v == NULL): determine the split point here */
          for (i = 0; i < nzB; i++) {
            if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
            else break;
          }
          imark = i;
        }
        for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
        for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
      }
    } else {
      if (idx) *idx = NULL;
      if (v) *v = NULL;
    }
  }
  *nz = nztot;
  PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
  PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1803 
1804 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1805 {
1806   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1807 
1808   PetscFunctionBegin;
1809   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1810   aij->getrowactive = PETSC_FALSE;
1811   PetscFunctionReturn(PETSC_SUCCESS);
1812 }
1813 
1814 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1815 {
1816   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1817   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1818   PetscInt         i, j, cstart = mat->cmap->rstart;
1819   PetscReal        sum = 0.0;
1820   const MatScalar *v, *amata, *bmata;
1821 
1822   PetscFunctionBegin;
1823   if (aij->size == 1) {
1824     PetscCall(MatNorm(aij->A, type, norm));
1825   } else {
1826     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1827     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1828     if (type == NORM_FROBENIUS) {
1829       v = amata;
1830       for (i = 0; i < amat->nz; i++) {
1831         sum += PetscRealPart(PetscConj(*v) * (*v));
1832         v++;
1833       }
1834       v = bmata;
1835       for (i = 0; i < bmat->nz; i++) {
1836         sum += PetscRealPart(PetscConj(*v) * (*v));
1837         v++;
1838       }
1839       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1840       *norm = PetscSqrtReal(*norm);
1841       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1842     } else if (type == NORM_1) { /* max column norm */
1843       PetscReal *tmp;
1844       PetscInt  *jj, *garray = aij->garray;
1845       PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
1846       *norm = 0.0;
1847       v     = amata;
1848       jj    = amat->j;
1849       for (j = 0; j < amat->nz; j++) {
1850         tmp[cstart + *jj++] += PetscAbsScalar(*v);
1851         v++;
1852       }
1853       v  = bmata;
1854       jj = bmat->j;
1855       for (j = 0; j < bmat->nz; j++) {
1856         tmp[garray[*jj++]] += PetscAbsScalar(*v);
1857         v++;
1858       }
1859       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, mat->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1860       for (j = 0; j < mat->cmap->N; j++) {
1861         if (tmp[j] > *norm) *norm = tmp[j];
1862       }
1863       PetscCall(PetscFree(tmp));
1864       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1865     } else if (type == NORM_INFINITY) { /* max row norm */
1866       PetscReal ntemp = 0.0;
1867       for (j = 0; j < aij->A->rmap->n; j++) {
1868         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1869         sum = 0.0;
1870         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1871           sum += PetscAbsScalar(*v);
1872           v++;
1873         }
1874         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1875         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1876           sum += PetscAbsScalar(*v);
1877           v++;
1878         }
1879         if (sum > ntemp) ntemp = sum;
1880       }
1881       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1882       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1883     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1884     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1885     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1886   }
1887   PetscFunctionReturn(PETSC_SUCCESS);
1888 }
1889 
/*
  Transpose an MPIAIJ matrix.  The diagonal block is transposed locally with
  MatTranspose() (all writes stay on-process); the off-diagonal block's entries are
  inserted into the result with MatSetValues(), since their transposed locations live
  on other ranks.  For MAT_INITIAL_MATRIX (and in-place reuse) a fresh matrix with
  exact preallocation is built first; MAT_INPLACE_MATRIX ends with a header merge.
*/
static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
{
  Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
  Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
  PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
  const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
  Mat              B, A_diag, *B_diag;
  const MatScalar *pbv, *bv;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
  ma = A->rmap->n;
  na = A->cmap->n;
  mb = a->B->rmap->n;
  nb = a->B->cmap->n;
  ai = Aloc->i;
  aj = Aloc->j;
  bi = Bloc->i;
  bj = Bloc->j;
  if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
    /* Build a new matrix with exact diagonal/off-diagonal preallocation for the transpose */
    PetscInt            *d_nnz, *g_nnz, *o_nnz;
    PetscSFNode         *oloc;
    PETSC_UNUSED PetscSF sf;

    PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
    /* compute d_nnz for preallocation */
    PetscCall(PetscArrayzero(d_nnz, na));
    for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
    /* compute local off-diagonal contributions */
    PetscCall(PetscArrayzero(g_nnz, nb));
    for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
    /* map those to global */
    PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
    PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
    PetscCall(PetscSFSetFromOptions(sf));
    PetscCall(PetscArrayzero(o_nnz, na));
    PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
    PetscCall(PetscSFDestroy(&sf));

    /* Transpose swaps the row and column layouts (and block sizes) */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
    PetscCall(MatSetBlockSizes(B, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
    PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
    PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
  } else {
    B = *matout;
    PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  }

  b           = (Mat_MPIAIJ *)B->data;
  A_diag      = a->A;
  B_diag      = &b->A;
  sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
  A_diag_ncol = A_diag->cmap->N;
  B_diag_ilen = sub_B_diag->ilen;
  B_diag_i    = sub_B_diag->i;

  /* Set ilen for diagonal of B */
  for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];

  /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
  very quickly (=without using MatSetValues), because all writes are local. */
  PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
  PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));

  /* copy over the B part */
  PetscCall(PetscMalloc1(bi[mb], &cols));
  PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
  pbv = bv;
  row = A->rmap->rstart;
  /* Translate B's compressed column indices to global indices up front */
  for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
  cols_tmp = cols;
  for (i = 0; i < mb; i++) {
    /* Row i of B becomes (part of) global column `row` of the transpose */
    ncol = bi[i + 1] - bi[i];
    PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
    row++;
    if (pbv) pbv += ncol;
    if (cols_tmp) cols_tmp += ncol;
  }
  PetscCall(PetscFree(cols));
  PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
    *matout = B;
  } else {
    /* MAT_INPLACE_MATRIX: replace A's contents with the transpose */
    PetscCall(MatHeaderMerge(A, &B));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1983 
/*
  Compute mat <- diag(ll) * mat * diag(rr) for an MPIAIJ matrix.
  ll (left scaling) applies row-wise and is local to each rank, so it is applied
  directly to both blocks.  rr (right scaling) applies column-wise, so the off-diagonal
  block needs the ghosted entries of rr, obtained via the lvec scatter; the scatter is
  started first so communication overlaps the diagonal-block scaling.
*/
static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  Mat         a = aij->A, b = aij->B;
  PetscInt    s1, s2, s3;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(mat, &s2, &s3));
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &s1));
    PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
    /* Overlap communication with computation. */
    PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  }
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &s1));
    PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
    PetscUseTypeMethod(b, diagonalscale, ll, NULL);
  }
  /* scale  the diagonal block */
  PetscUseTypeMethod(a, diagonalscale, ll, rr);

  if (rr) {
    /* Do a scatter end and then right scale the off-diagonal block */
    PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2013 
2014 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2015 {
2016   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2017 
2018   PetscFunctionBegin;
2019   PetscCall(MatSetUnfactored(a->A));
2020   PetscFunctionReturn(PETSC_SUCCESS);
2021 }
2022 
2023 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2024 {
2025   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2026   Mat         a, b, c, d;
2027   PetscBool   flg;
2028 
2029   PetscFunctionBegin;
2030   a = matA->A;
2031   b = matA->B;
2032   c = matB->A;
2033   d = matB->B;
2034 
2035   PetscCall(MatEqual(a, c, &flg));
2036   if (flg) PetscCall(MatEqual(b, d, &flg));
2037   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2038   PetscFunctionReturn(PETSC_SUCCESS);
2039 }
2040 
2041 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2042 {
2043   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2044   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2045 
2046   PetscFunctionBegin;
2047   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2048   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2049     /* because of the column compression in the off-processor part of the matrix a->B,
2050        the number of columns in a->B and b->B may be different, hence we cannot call
2051        the MatCopy() directly on the two parts. If need be, we can provide a more
2052        efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
2053        then copying the submatrices */
2054     PetscCall(MatCopy_Basic(A, B, str));
2055   } else {
2056     PetscCall(MatCopy(a->A, b->A, str));
2057     PetscCall(MatCopy(a->B, b->B, str));
2058   }
2059   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2060   PetscFunctionReturn(PETSC_SUCCESS);
2061 }
2062 
2063 /*
2064    Computes the number of nonzeros per row needed for preallocation when X and Y
2065    have different nonzero structure.
2066 */
2067 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2068 {
2069   PetscInt i, j, k, nzx, nzy;
2070 
2071   PetscFunctionBegin;
2072   /* Set the number of nonzeros in the new matrix */
2073   for (i = 0; i < m; i++) {
2074     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2075     nzx    = xi[i + 1] - xi[i];
2076     nzy    = yi[i + 1] - yi[i];
2077     nnz[i] = 0;
2078     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2079       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2080       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2081       nnz[i]++;
2082     }
2083     for (; k < nzy; k++) nnz[i]++;
2084   }
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2089 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2090 {
2091   PetscInt    m = Y->rmap->N;
2092   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2093   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2094 
2095   PetscFunctionBegin;
2096   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2097   PetscFunctionReturn(PETSC_SUCCESS);
2098 }
2099 
2100 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2101 {
2102   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2103 
2104   PetscFunctionBegin;
2105   if (str == SAME_NONZERO_PATTERN) {
2106     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2107     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2108   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2109     PetscCall(MatAXPY_Basic(Y, a, X, str));
2110   } else {
2111     Mat       B;
2112     PetscInt *nnz_d, *nnz_o;
2113 
2114     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2115     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2116     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2117     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2118     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2119     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2120     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2121     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2122     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2123     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2124     PetscCall(MatHeaderMerge(Y, &B));
2125     PetscCall(PetscFree(nnz_d));
2126     PetscCall(PetscFree(nnz_o));
2127   }
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 
2131 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2132 
2133 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2134 {
2135   PetscFunctionBegin;
2136   if (PetscDefined(USE_COMPLEX)) {
2137     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2138 
2139     PetscCall(MatConjugate_SeqAIJ(aij->A));
2140     PetscCall(MatConjugate_SeqAIJ(aij->B));
2141   }
2142   PetscFunctionReturn(PETSC_SUCCESS);
2143 }
2144 
2145 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2146 {
2147   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2148 
2149   PetscFunctionBegin;
2150   PetscCall(MatRealPart(a->A));
2151   PetscCall(MatRealPart(a->B));
2152   PetscFunctionReturn(PETSC_SUCCESS);
2153 }
2154 
2155 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2156 {
2157   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2158 
2159   PetscFunctionBegin;
2160   PetscCall(MatImaginaryPart(a->A));
2161   PetscCall(MatImaginaryPart(a->B));
2162   PetscFunctionReturn(PETSC_SUCCESS);
2163 }
2164 
/* v[i] = entry of maximum absolute value in local row i of A; if idx[] is non-NULL it
   receives the global column index of that entry.  The result is combined from the
   diagonal (a->A) and off-diagonal (a->B) blocks. */
static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
  PetscInt           i, *idxb = NULL, m = A->rmap->n;
  PetscScalar       *vv;
  Vec                vB, vA;
  const PetscScalar *va, *vb;

  PetscFunctionBegin;
  /* row-wise max over the diagonal block; idx[] gets LOCAL column indices of a->A */
  PetscCall(MatCreateVecs(a->A, NULL, &vA));
  PetscCall(MatGetRowMaxAbs(a->A, vA, idx));

  PetscCall(VecGetArrayRead(vA, &va));
  if (idx) {
    /* shift local diagonal-block column indices to global; rows whose max is 0 are left as-is */
    for (i = 0; i < m; i++) {
      if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
    }
  }

  /* row-wise max over the off-diagonal block; idxb[] gets compressed column indices of a->B */
  PetscCall(MatCreateVecs(a->B, NULL, &vB));
  PetscCall(PetscMalloc1(m, &idxb));
  PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));

  PetscCall(VecGetArrayWrite(v, &vv));
  PetscCall(VecGetArrayRead(vB, &vb));
  for (i = 0; i < m; i++) {
    if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
      /* off-diagonal entry wins; map compressed index to global column via garray */
      vv[i] = vb[i];
      if (idx) idx[i] = a->garray[idxb[i]];
    } else {
      vv[i] = va[i];
      /* on an exact tie prefer the smaller global column index */
      if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
    }
  }
  PetscCall(VecRestoreArrayWrite(v, &vv));
  PetscCall(VecRestoreArrayRead(vA, &va));
  PetscCall(VecRestoreArrayRead(vB, &vb));
  PetscCall(PetscFree(idxb));
  PetscCall(VecDestroy(&vA));
  PetscCall(VecDestroy(&vB));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2207 
2208 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2209 {
2210   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2211   Vec         vB, vA;
2212 
2213   PetscFunctionBegin;
2214   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2215   PetscCall(MatGetRowSumAbs(a->A, vA));
2216   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2217   PetscCall(MatGetRowSumAbs(a->B, vB));
2218   PetscCall(VecAXPY(vA, 1.0, vB));
2219   PetscCall(VecDestroy(&vB));
2220   PetscCall(VecCopy(vA, v));
2221   PetscCall(VecDestroy(&vA));
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
/* v[r] = entry of minimum absolute value in local row r of A, taken over ALL global columns,
   so implicit zeros in the off-diagonal part participate.  If idx[] is non-NULL it receives
   the global column index of the winning entry. */
static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray;
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* everything is in the diagonal block; compute straight into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    if (m) {
      /* rows but no global columns at all on this side: rows are all implicit zeros */
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = 0.0;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav;
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse: it has implicit zeros, and |0.0| cannot be beaten for a min-abs */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      /* NOTE(review): comparing the compressed index j against cstart below looks suspicious,
         but it mirrors MatGetRowMin_MPIAIJ/MatGetRowMax_MPIAIJ — verify before changing */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
        }
      }
    }

    /* scan the stored off-diagonal entries of this row for a smaller magnitude */
    for (j = 0; j < ncols; j++) {
      if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* merge diagonal-block and off-diagonal results; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r];
    } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2340 
/* v[r] = minimum entry (compared by real part) in local row r of A, taken over ALL global
   columns, so implicit zeros in the off-diagonal part participate.  If idx[] is non-NULL it
   receives the global column index of the winning entry. */
static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray;
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* everything is in the diagonal block; compute straight into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMin(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    if (m) {
      /* no columns anywhere: report the identity element of min */
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = PETSC_MAX_REAL;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav;
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse: an implicit 0.0 is a candidate for the minimum, so start there */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      /* NOTE(review): comparing the compressed index j against cstart below looks suspicious,
         but it mirrors MatGetRowMinAbs_MPIAIJ/MatGetRowMax_MPIAIJ — verify before changing */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
        }
      }
    }

    /* scan the stored off-diagonal entries of this row for a smaller value */
    for (j = 0; j < ncols; j++) {
      if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* merge diagonal-block and off-diagonal results; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r];
    } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2456 
/* v[r] = maximum entry (compared by real part) in local row r of A, taken over ALL global
   columns, so implicit zeros in the off-diagonal part participate.  If idx[] is non-NULL it
   receives the global column index of the winning entry. */
static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray;
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* everything is in the diagonal block; compute straight into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMax(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    if (m) {
      /* no columns anywhere: report the identity element of max */
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = PETSC_MIN_REAL;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav;
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse so already KNOW maximum is 0.0 or higher */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      /* NOTE(review): comparing the compressed index j against cstart below looks suspicious,
         but it mirrors MatGetRowMin_MPIAIJ/MatGetRowMinAbs_MPIAIJ — verify before changing */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
        }
      }
    }

    /* scan the stored off-diagonal entries of this row for a larger value */
    for (j = 0; j < ncols; j++) {
      if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* merge diagonal-block and off-diagonal results; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r];
    } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2572 
2573 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2574 {
2575   Mat *dummy;
2576 
2577   PetscFunctionBegin;
2578   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2579   *newmat = *dummy;
2580   PetscCall(PetscFree(dummy));
2581   PetscFunctionReturn(PETSC_SUCCESS);
2582 }
2583 
2584 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2585 {
2586   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2587 
2588   PetscFunctionBegin;
2589   PetscCall(MatInvertBlockDiagonal(a->A, values));
2590   A->factorerrortype = a->A->factorerrortype;
2591   PetscFunctionReturn(PETSC_SUCCESS);
2592 }
2593 
2594 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2595 {
2596   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2597 
2598   PetscFunctionBegin;
2599   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2600   PetscCall(MatSetRandom(aij->A, rctx));
2601   if (x->assembled) {
2602     PetscCall(MatSetRandom(aij->B, rctx));
2603   } else {
2604     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2605   }
2606   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2607   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2608   PetscFunctionReturn(PETSC_SUCCESS);
2609 }
2610 
2611 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2612 {
2613   PetscFunctionBegin;
2614   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2615   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2616   PetscFunctionReturn(PETSC_SUCCESS);
2617 }
2618 
2619 /*@
2620   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2621 
2622   Not Collective
2623 
2624   Input Parameter:
2625 . A - the matrix
2626 
2627   Output Parameter:
2628 . nz - the number of nonzeros
2629 
2630   Level: advanced
2631 
2632 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2633 @*/
2634 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2635 {
2636   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2637   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2638   PetscBool   isaij;
2639 
2640   PetscFunctionBegin;
2641   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2642   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2643   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2644   PetscFunctionReturn(PETSC_SUCCESS);
2645 }
2646 
2647 /*@
2648   MatMPIAIJSetUseScalableIncreaseOverlap - Determine if the matrix uses a scalable algorithm to compute the overlap
2649 
2650   Collective
2651 
2652   Input Parameters:
2653 + A  - the matrix
2654 - sc - `PETSC_TRUE` indicates use the scalable algorithm (default is not to use the scalable algorithm)
2655 
2656   Level: advanced
2657 
2658 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2659 @*/
PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
{
  PetscFunctionBegin;
  /* dispatch to the implementation registered by the concrete type; silently a no-op if absent */
  PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2666 
2667 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems *PetscOptionsObject)
2668 {
2669   PetscBool sc = PETSC_FALSE, flg;
2670 
2671   PetscFunctionBegin;
2672   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2673   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2674   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2675   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2676   PetscOptionsHeadEnd();
2677   PetscFunctionReturn(PETSC_SUCCESS);
2678 }
2679 
2680 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2681 {
2682   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2683   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2684 
2685   PetscFunctionBegin;
2686   if (!Y->preallocated) {
2687     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2688   } else if (!aij->nz) { /* It does not matter if diagonals of Y only partially lie in maij->A. We just need an estimated preallocation. */
2689     PetscInt nonew = aij->nonew;
2690     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2691     aij->nonew = nonew;
2692   }
2693   PetscCall(MatShift_Basic(Y, a));
2694   PetscFunctionReturn(PETSC_SUCCESS);
2695 }
2696 
2697 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2698 {
2699   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2700 
2701   PetscFunctionBegin;
2702   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2703   PetscCall(MatMissingDiagonal(a->A, missing, d));
2704   if (d) {
2705     PetscInt rstart;
2706     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2707     *d += rstart;
2708   }
2709   PetscFunctionReturn(PETSC_SUCCESS);
2710 }
2711 
2712 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2713 {
2714   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2715 
2716   PetscFunctionBegin;
2717   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2718   PetscFunctionReturn(PETSC_SUCCESS);
2719 }
2720 
2721 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2722 {
2723   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2724 
2725   PetscFunctionBegin;
2726   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2727   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2728   PetscFunctionReturn(PETSC_SUCCESS);
2729 }
2730 
/* Virtual function table for MATMPIAIJ.  Entries are positional: slot k implements the
   k-th MatOperation (the numbered comments track the slot index).  NULL means the
   operation is unsupported here or is registered dynamically elsewhere.  Do NOT reorder
   entries — the positions must match the MatOperation enumeration exactly. */
static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
                                       MatGetRow_MPIAIJ,
                                       MatRestoreRow_MPIAIJ,
                                       MatMult_MPIAIJ,
                                       /* 4*/ MatMultAdd_MPIAIJ,
                                       MatMultTranspose_MPIAIJ,
                                       MatMultTransposeAdd_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_MPIAIJ,
                                       MatTranspose_MPIAIJ,
                                       /*15*/ MatGetInfo_MPIAIJ,
                                       MatEqual_MPIAIJ,
                                       MatGetDiagonal_MPIAIJ,
                                       MatDiagonalScale_MPIAIJ,
                                       MatNorm_MPIAIJ,
                                       /*20*/ MatAssemblyBegin_MPIAIJ,
                                       MatAssemblyEnd_MPIAIJ,
                                       MatSetOption_MPIAIJ,
                                       MatZeroEntries_MPIAIJ,
                                       /*24*/ MatZeroRows_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*29*/ MatSetUp_MPI_Hash,
                                       NULL,
                                       NULL,
                                       MatGetDiagonalBlock_MPIAIJ,
                                       NULL,
                                       /*34*/ MatDuplicate_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*39*/ MatAXPY_MPIAIJ,
                                       MatCreateSubMatrices_MPIAIJ,
                                       MatIncreaseOverlap_MPIAIJ,
                                       MatGetValues_MPIAIJ,
                                       MatCopy_MPIAIJ,
                                       /*44*/ MatGetRowMax_MPIAIJ,
                                       MatScale_MPIAIJ,
                                       MatShift_MPIAIJ,
                                       MatDiagonalSet_MPIAIJ,
                                       MatZeroRowsColumns_MPIAIJ,
                                       /*49*/ MatSetRandom_MPIAIJ,
                                       MatGetRowIJ_MPIAIJ,
                                       MatRestoreRowIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*54*/ MatFDColoringCreate_MPIXAIJ,
                                       NULL,
                                       MatSetUnfactored_MPIAIJ,
                                       MatPermute_MPIAIJ,
                                       NULL,
                                       /*59*/ MatCreateSubMatrix_MPIAIJ,
                                       MatDestroy_MPIAIJ,
                                       MatView_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*64*/ NULL,
                                       MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*69*/ MatGetRowMaxAbs_MPIAIJ,
                                       MatGetRowMinAbs_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*75*/ MatFDColoringApply_AIJ,
                                       MatSetFromOptions_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatFindZeroDiagonals_MPIAIJ,
                                       /*80*/ NULL,
                                       NULL,
                                       NULL,
                                       /*83*/ MatLoad_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*89*/ NULL,
                                       NULL,
                                       MatMatMultNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*94*/ MatPtAPNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatBindToCPU_MPIAIJ,
                                       /*99*/ MatProductSetFromOptions_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatConjugate_MPIAIJ,
                                       NULL,
                                       /*104*/ MatSetValuesRow_MPIAIJ,
                                       MatRealPart_MPIAIJ,
                                       MatImaginaryPart_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       MatGetRowMin_MPIAIJ,
                                       NULL,
                                       MatMissingDiagonal_MPIAIJ,
                                       /*114*/ MatGetSeqNonzeroStructure_MPIAIJ,
                                       NULL,
                                       MatGetGhosts_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*119*/ MatMultDiagonalBlock_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatGetMultiProcBlock_MPIAIJ,
                                       /*124*/ MatFindNonzeroRows_MPIAIJ,
                                       MatGetColumnReductions_MPIAIJ,
                                       MatInvertBlockDiagonal_MPIAIJ,
                                       MatInvertVariableBlockDiagonal_MPIAIJ,
                                       MatCreateSubMatricesMPI_MPIAIJ,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ MatSetBlockSizes_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_MPIXAIJ,
                                       MatFindOffBlockDiagonalEntries_MPIAIJ,
                                       MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
                                       /*145*/ NULL,
                                       NULL,
                                       NULL,
                                       MatCreateGraph_Simple_AIJ,
                                       NULL,
                                       /*150*/ NULL,
                                       MatEliminateZeros_MPIAIJ,
                                       MatGetRowSumAbs_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*155*/ NULL,
                                       MatCopyHashToXAIJ_MPI_Hash};
2888 
2889 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2890 {
2891   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2892 
2893   PetscFunctionBegin;
2894   PetscCall(MatStoreValues(aij->A));
2895   PetscCall(MatStoreValues(aij->B));
2896   PetscFunctionReturn(PETSC_SUCCESS);
2897 }
2898 
2899 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2900 {
2901   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2902 
2903   PetscFunctionBegin;
2904   PetscCall(MatRetrieveValues(aij->A));
2905   PetscCall(MatRetrieveValues(aij->B));
2906   PetscFunctionReturn(PETSC_SUCCESS);
2907 }
2908 
/*
  Preallocates the MPIAIJ matrix B by (re)creating its sequential diagonal block (b->A)
  and off-diagonal block (b->B) with the user-supplied nonzero estimates.

  d_nz/d_nnz - nonzeros per row (uniform/per-row) for the diagonal block
  o_nz/o_nnz - nonzeros per row (uniform/per-row) for the off-diagonal block

  Any previously built assembly artifacts (column map, garray, local vector, scatter)
  are destroyed here; they are rebuilt during the next assembly.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
  PetscMPIInt size;

  PetscFunctionBegin;
  /* If the matrix was in hash-table insertion mode, restore the function table
     that was saved (in b->cops) when hash mode was activated */
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  /* Discard the stale global-to-local column mapping of the off-diagonal block */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));

  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));

  /* Recreate the off-diagonal block; on a single process it has zero columns
     since every column then belongs to the diagonal block.
     The Get/Restore options pair preserves the block's SeqXAIJ options across the destroy/create. */
  MatSeqXAIJGetOptions_Private(b->B);
  PetscCall(MatDestroy(&b->B));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
  PetscCall(MatSetType(b->B, MATSEQAIJ));
  MatSeqXAIJRestoreOptions_Private(b->B);

  /* Recreate the diagonal block with the local row and column dimensions of B */
  MatSeqXAIJGetOptions_Private(b->A);
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
  PetscCall(MatSetType(b->A, MATSEQAIJ));
  MatSeqXAIJRestoreOptions_Private(b->A);

  /* Hand the user's nonzero estimates to the two sequential blocks */
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2956 
/*
  Resets the preallocation of B so new nonzero structure can be inserted,
  while keeping the matrix usable without a fresh MatXAIJSetPreallocation() call.
*/
static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* If B has ever been assembled, disassembly tears down the off-diagonal
     compression (colmap/garray/lvec) internally; otherwise destroy those
     pieces explicitly here */
  if (B->assembled || B->was_assembled) PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
  else {
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapIDestroy(&b->colmap));
#else
    PetscCall(PetscFree(b->colmap));
#endif
    PetscCall(PetscFree(b->garray));
    PetscCall(VecDestroy(&b->lvec));
  }
  PetscCall(VecScatterDestroy(&b->Mvctx));

  /* Reset the sequential diagonal and off-diagonal blocks in place */
  PetscCall(MatResetPreallocation(b->A));
  PetscCall(MatResetPreallocation(b->B));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2984 
/*
  Duplicates the MPIAIJ matrix matin into *newmat; cpvalues determines whether
  numerical values are copied (see MatDuplicateOption).
*/
PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
{
  Mat         mat;
  Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;

  PetscFunctionBegin;
  *newmat = NULL;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
  PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
  PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
  PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
  a = (Mat_MPIAIJ *)mat->data;

  mat->factortype = matin->factortype;
  mat->assembled  = matin->assembled;
  mat->insertmode = NOT_SET_VALUES;

  a->size         = oldmat->size;
  a->rank         = oldmat->rank;
  a->donotstash   = oldmat->donotstash;
  a->roworiented  = oldmat->roworiented;
  /* Per-call MatGetRow() workspace is lazily allocated, so start empty */
  a->rowindices   = NULL;
  a->rowvalues    = NULL;
  a->getrowactive = PETSC_FALSE;

  PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
  PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
  if (matin->hash_active) {
    /* Source is still in hash insertion mode: just set up the duplicate the same way */
    PetscCall(MatSetUp(mat));
  } else {
    mat->preallocated = matin->preallocated;
    /* Deep-copy the global-to-local column map of the off-diagonal block, if present */
    if (oldmat->colmap) {
#if defined(PETSC_USE_CTABLE)
      PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
#else
      PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
      PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
#endif
    } else a->colmap = NULL;
    /* Deep-copy garray (global column indices of the off-diagonal block) */
    if (oldmat->garray) {
      PetscInt len;
      len = oldmat->B->cmap->n;
      PetscCall(PetscMalloc1(len + 1, &a->garray));
      if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
    } else a->garray = NULL;

    /* It may happen MatDuplicate is called with a non-assembled matrix
      In fact, MatDuplicate only requires the matrix to be preallocated
      This may happen inside a DMCreateMatrix_Shell */
    if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
    /* The scatter context is immutable once built, so share it by reference */
    if (oldmat->Mvctx) {
      a->Mvctx = oldmat->Mvctx;
      PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
    }
    PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
    PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
  }
  PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
  *newmat = mat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3046 
3047 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3048 {
3049   PetscBool isbinary, ishdf5;
3050 
3051   PetscFunctionBegin;
3052   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3053   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3054   /* force binary viewer to load .info file if it has not yet done so */
3055   PetscCall(PetscViewerSetUp(viewer));
3056   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3057   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3058   if (isbinary) {
3059     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3060   } else if (ishdf5) {
3061 #if defined(PETSC_HAVE_HDF5)
3062     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3063 #else
3064     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3065 #endif
3066   } else {
3067     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3068   }
3069   PetscFunctionReturn(PETSC_SUCCESS);
3070 }
3071 
/*
  Reads an MPIAIJ matrix from a PETSc binary viewer.

  File layout (all collective reads): a 4-entry header
  [MAT_FILE_CLASSID, M, N, nz], then the per-row nonzero counts, then all
  column indices, then all values.
*/
PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
{
  PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
  PetscInt    *rowidxs, *colidxs;
  PetscScalar *matvals;

  PetscFunctionBegin;
  PetscCall(PetscViewerSetUp(viewer));

  /* read in matrix header */
  PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
  PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
  M  = header[1];
  N  = header[2];
  nz = header[3];
  PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
  PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
  /* nz < 0 marks a special on-disk format (e.g. dense/factored) that AIJ cannot read */
  PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");

  /* set block sizes from the viewer's .info file */
  PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
  /* set global sizes if not set already */
  if (mat->rmap->N < 0) mat->rmap->N = M;
  if (mat->cmap->N < 0) mat->cmap->N = N;
  PetscCall(PetscLayoutSetUp(mat->rmap));
  PetscCall(PetscLayoutSetUp(mat->cmap));

  /* check if the matrix sizes are correct */
  PetscCall(MatGetSize(mat, &rows, &cols));
  PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);

  /* read in row lengths and build row indices (prefix-sum them into CSR row offsets) */
  PetscCall(MatGetLocalSize(mat, &m, NULL));
  PetscCall(PetscMalloc1(m + 1, &rowidxs));
  PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
  rowidxs[0] = 0;
  for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
  /* nz == PETSC_INT_MAX signals an unknown total, so skip the consistency check */
  if (nz != PETSC_INT_MAX) {
    PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
    PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
  }

  /* read in column indices and matrix values */
  PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
  PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
  PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
  /* store matrix indices and values */
  PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
  PetscCall(PetscFree(rowidxs));
  PetscCall(PetscFree2(colidxs, matvals));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3124 
3125 /* Not scalable because of ISAllGather() unless getting all columns. */
3126 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3127 {
3128   IS          iscol_local;
3129   PetscBool   isstride;
3130   PetscMPIInt gisstride = 0;
3131 
3132   PetscFunctionBegin;
3133   /* check if we are grabbing all columns*/
3134   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3135 
3136   if (isstride) {
3137     PetscInt start, len, mstart, mlen;
3138     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3139     PetscCall(ISGetLocalSize(iscol, &len));
3140     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3141     if (mstart == start && mlen - mstart == len) gisstride = 1;
3142   }
3143 
3144   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3145   if (gisstride) {
3146     PetscInt N;
3147     PetscCall(MatGetSize(mat, NULL, &N));
3148     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3149     PetscCall(ISSetIdentity(iscol_local));
3150     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3151   } else {
3152     PetscInt cbs;
3153     PetscCall(ISGetBlockSize(iscol, &cbs));
3154     PetscCall(ISAllGather(iscol, &iscol_local));
3155     PetscCall(ISSetBlockSize(iscol_local, cbs));
3156   }
3157 
3158   *isseq = iscol_local;
3159   PetscFunctionReturn(PETSC_SUCCESS);
3160 }
3161 
3162 /*
3163  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and global size of iscol_local
3164  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3165 
3166  Input Parameters:
3167 +   mat - matrix
3168 .   isrow - parallel row index set; its local indices are a subset of local columns of `mat`,
3169            i.e., mat->rstart <= isrow[i] < mat->rend
3170 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3171            i.e., mat->cstart <= iscol[i] < mat->cend
3172 
3173  Output Parameters:
3174 +   isrow_d - sequential row index set for retrieving mat->A
.   iscol_d - sequential column index set for retrieving mat->A
3176 .   iscol_o - sequential column index set for retrieving mat->B
3177 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3178  */
static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
{
  Vec             x, cmap;
  const PetscInt *is_idx;
  PetscScalar    *xarray, *cmaparray;
  PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
  Mat             B    = a->B;
  Vec             lvec = a->lvec, lcmap;
  PetscInt        i, cstart, cend, Bn = B->cmap->N;
  MPI_Comm        comm;
  VecScatter      Mvctx = a->Mvctx;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCall(ISGetLocalSize(iscol, &ncols));

  /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
  PetscCall(MatCreateVecs(mat, &x, NULL));
  PetscCall(VecSet(x, -1.0));
  PetscCall(VecDuplicate(x, &cmap));
  PetscCall(VecSet(cmap, -1.0));

  /* Get start indices: exclusive prefix sum of local iscol sizes over the ranks */
  PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
  isstart -= ncols;
  PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));

  /* Mark the selected columns in x, record their submatrix column numbers in cmap,
     and collect their local (diagonal-block) indices in idx */
  PetscCall(ISGetIndices(iscol, &is_idx));
  PetscCall(VecGetArray(x, &xarray));
  PetscCall(VecGetArray(cmap, &cmaparray));
  PetscCall(PetscMalloc1(ncols, &idx));
  for (i = 0; i < ncols; i++) {
    xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
    cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
    idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
  }
  PetscCall(VecRestoreArray(x, &xarray));
  PetscCall(VecRestoreArray(cmap, &cmaparray));
  PetscCall(ISRestoreIndices(iscol, &is_idx));

  /* Get iscol_d (takes ownership of idx via PETSC_OWN_POINTER) */
  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
  PetscCall(ISGetBlockSize(iscol, &i));
  PetscCall(ISSetBlockSize(*iscol_d, i));

  /* Get isrow_d: shift the global row indices to local (diagonal-block) numbering */
  PetscCall(ISGetLocalSize(isrow, &m));
  rstart = mat->rmap->rstart;
  PetscCall(PetscMalloc1(m, &idx));
  PetscCall(ISGetIndices(isrow, &is_idx));
  for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
  PetscCall(ISRestoreIndices(isrow, &is_idx));

  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
  PetscCall(ISGetBlockSize(isrow, &i));
  PetscCall(ISSetBlockSize(*isrow_d, i));

  /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
  PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));

  PetscCall(VecDuplicate(lvec, &lcmap));

  PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));

  /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
  /* off-process column indices: entries of lvec still at -1 were not selected by any rank's iscol */
  count = 0;
  PetscCall(PetscMalloc1(Bn, &idx));
  PetscCall(PetscMalloc1(Bn, &cmap1));

  PetscCall(VecGetArray(lvec, &xarray));
  PetscCall(VecGetArray(lcmap, &cmaparray));
  for (i = 0; i < Bn; i++) {
    if (PetscRealPart(xarray[i]) > -1.0) {
      idx[count]   = i;                                     /* local column index in off-diagonal part B */
      cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
      count++;
    }
  }
  PetscCall(VecRestoreArray(lvec, &xarray));
  PetscCall(VecRestoreArray(lcmap, &cmaparray));

  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
  /* cannot ensure iscol_o has same blocksize as iscol! */

  PetscCall(PetscFree(idx));
  /* caller owns cmap1 (returned as *garray) and must PetscFree() it */
  *garray = cmap1;

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&cmap));
  PetscCall(VecDestroy(&lcmap));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3275 
/* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
  Mat         M = NULL;
  MPI_Comm    comm;
  IS          iscol_d, isrow_d, iscol_o;
  Mat         Asub = NULL, Bsub = NULL;
  PetscInt    n;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));

  if (call == MAT_REUSE_MATRIX) {
    /* Retrieve isrow_d, iscol_d and iscol_o from submat (composed there on the initial call) */
    PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
    PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
    PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
    PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");

    /* Update diagonal and off-diagonal portions of submat */
    asub = (Mat_MPIAIJ *)(*submat)->data;
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
    PetscCall(ISGetLocalSize(iscol_o, &n));
    /* skip the off-diagonal update when no off-process columns were selected */
    if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
    PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));

  } else { /* call == MAT_INITIAL_MATRIX) */
    PetscInt *garray;
    PetscInt  BsubN;

    /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
    PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));

    /* Create local submatrices Asub and Bsub */
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));

    /* Create submatrix M (takes ownership of Asub and Bsub) */
    PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, Asub, Bsub, garray, &M));

    /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
    asub = (Mat_MPIAIJ *)M->data;

    PetscCall(ISGetLocalSize(iscol_o, &BsubN));
    n = asub->B->cmap->N;
    if (BsubN > n) {
      /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
      const PetscInt *idx;
      PetscInt        i, j, *idx_new, *subgarray = asub->garray;
      PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));

      /* Walk garray (old, BsubN entries) and subgarray (condensed, n entries) in
         lockstep; both are sorted, so keep only the iscol_o entries whose global
         column survived the condensation */
      PetscCall(PetscMalloc1(n, &idx_new));
      j = 0;
      PetscCall(ISGetIndices(iscol_o, &idx));
      for (i = 0; i < n; i++) {
        if (j >= BsubN) break;
        while (subgarray[i] > garray[j]) j++;

        if (subgarray[i] == garray[j]) {
          idx_new[i] = idx[j++];
        } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot < garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
      }
      PetscCall(ISRestoreIndices(iscol_o, &idx));

      PetscCall(ISDestroy(&iscol_o));
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));

    } else if (BsubN < n) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
    }

    PetscCall(PetscFree(garray));
    *submat = M;

    /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
    PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
    PetscCall(ISDestroy(&isrow_d));

    PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
    PetscCall(ISDestroy(&iscol_d));

    PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
    PetscCall(ISDestroy(&iscol_o));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3368 
/*
  Extracts the parallel submatrix mat[isrow, iscol]. Dispatches to specialized
  implementations when isrow (and possibly iscol) have the same processor
  distribution as mat, falling back to the non-scalable gather-based path.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
{
  IS        iscol_local = NULL, isrow_d;
  PetscInt  csize;
  PetscInt  n, i, j, start, end;
  PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
  MPI_Comm  comm;

  PetscFunctionBegin;
  /* If isrow has same processor distribution as mat,
     call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with global size of iscol */
  if (call == MAT_REUSE_MATRIX) {
    /* Which specialized path created *newmat is recorded by the composed objects */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
    if (isrow_d) {
      sameRowDist  = PETSC_TRUE;
      tsameDist[1] = PETSC_TRUE; /* sameColDist */
    } else {
      PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
      if (iscol_local) {
        sameRowDist  = PETSC_TRUE;
        tsameDist[1] = PETSC_FALSE; /* !sameColDist */
      }
    }
  } else {
    /* Check if isrow has same processor distribution as mat */
    sameDist[0] = PETSC_FALSE;
    PetscCall(ISGetLocalSize(isrow, &n));
    if (!n) {
      sameDist[0] = PETSC_TRUE;
    } else {
      /* all local indices must fall inside this rank's ownership range */
      PetscCall(ISGetMinMax(isrow, &i, &j));
      PetscCall(MatGetOwnershipRange(mat, &start, &end));
      if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
    }

    /* Check if iscol has same processor distribution as mat */
    sameDist[1] = PETSC_FALSE;
    PetscCall(ISGetLocalSize(iscol, &n));
    if (!n) {
      sameDist[1] = PETSC_TRUE;
    } else {
      PetscCall(ISGetMinMax(iscol, &i, &j));
      PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
      if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
    }

    /* MPI_LAND: the specialized paths require the property to hold on every rank */
    PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
    PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
    sameRowDist = tsameDist[0];
  }

  if (sameRowDist) {
    if (tsameDist[1]) { /* sameRowDist & sameColDist */
      /* isrow and iscol have same processor distribution as mat */
      PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else { /* sameRowDist */
      /* isrow has same processor distribution as mat */
      if (call == MAT_INITIAL_MATRIX) {
        PetscBool sorted;
        PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
        PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
        PetscCall(ISGetSize(iscol, &i));
        PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);

        PetscCall(ISSorted(iscol_local, &sorted));
        if (sorted) {
          /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
          PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
          PetscFunctionReturn(PETSC_SUCCESS);
        }
        /* unsorted iscol_local: fall through to the general path below, reusing iscol_local */
      } else { /* call == MAT_REUSE_MATRIX */
        IS iscol_sub;
        PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
        if (iscol_sub) {
          PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
          PetscFunctionReturn(PETSC_SUCCESS);
        }
      }
    }
  }

  /* General case: iscol -> iscol_local which has global size of iscol */
  if (call == MAT_REUSE_MATRIX) {
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
    PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
  } else {
    if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
  }

  PetscCall(ISGetLocalSize(iscol, &csize));
  PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));

  if (call == MAT_INITIAL_MATRIX) {
    /* stash iscol_local on the submatrix so a later MAT_REUSE_MATRIX call can find it */
    PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
    PetscCall(ISDestroy(&iscol_local));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3468 
3469 /*@C
3470   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3471   and "off-diagonal" part of the matrix in CSR format.
3472 
3473   Collective
3474 
3475   Input Parameters:
3476 + comm   - MPI communicator
3477 . A      - "diagonal" portion of matrix
3478 . B      - "off-diagonal" portion of matrix, may have empty columns, will be destroyed by this routine
3479 - garray - global index of `B` columns
3480 
3481   Output Parameter:
3482 . mat - the matrix, with input `A` as its local diagonal matrix
3483 
3484   Level: advanced
3485 
3486   Notes:
3487   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3488 
3489   `A` becomes part of output mat, `B` is destroyed by this routine. The user cannot use `A` and `B` anymore.
3490 
3491 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3492 @*/
PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, Mat A, Mat B, const PetscInt garray[], Mat *mat)
{
  Mat_MPIAIJ        *maij;
  Mat_SeqAIJ        *b  = (Mat_SeqAIJ *)B->data, *bnew;
  PetscInt          *oi = b->i, *oj = b->j, i, nz, col;
  const PetscScalar *oa;
  Mat                Bnew;
  PetscInt           m, n, N;
  MatType            mpi_mat_type;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, mat));
  PetscCall(MatGetSize(A, &m, &n));
  PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
  PetscCheck(PetscAbs(A->rmap->bs) == PetscAbs(B->rmap->bs), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
  /* remove check below; When B is created using iscol_o from ISGetSeqIS_SameColDist_Private(), its bs may not be same as A */
  /* PetscCheck(A->cmap->bs == B->cmap->bs,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"A column bs %" PetscInt_FMT " != B column bs %" PetscInt_FMT,A->cmap->bs,B->cmap->bs); */

  /* Get global columns of mat */
  PetscCallMPI(MPIU_Allreduce(&n, &N, 1, MPIU_INT, MPI_SUM, comm));

  PetscCall(MatSetSizes(*mat, m, n, PETSC_DECIDE, N));
  /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
  PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
  PetscCall(MatSetType(*mat, mpi_mat_type));

  if (A->rmap->bs > 1 || A->cmap->bs > 1) PetscCall(MatSetBlockSizes(*mat, A->rmap->bs, A->cmap->bs));
  maij = (Mat_MPIAIJ *)(*mat)->data;

  (*mat)->preallocated = PETSC_TRUE;

  PetscCall(PetscLayoutSetUp((*mat)->rmap));
  PetscCall(PetscLayoutSetUp((*mat)->cmap));

  /* Set A as diagonal portion of *mat */
  maij->A = A;

  /* Translate B's compact column indices to global numbering, in place, using garray */
  nz = oi[m];
  for (i = 0; i < nz; i++) {
    col   = oj[i];
    oj[i] = garray[col];
  }

  /* Set Bnew as off-diagonal portion of *mat; it reuses B's oi/oj index arrays and value array */
  PetscCall(MatSeqAIJGetArrayRead(B, &oa));
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, N, oi, oj, (PetscScalar *)oa, &Bnew));
  PetscCall(MatSeqAIJRestoreArrayRead(B, &oa));
  bnew        = (Mat_SeqAIJ *)Bnew->data;
  bnew->maxnz = b->maxnz; /* allocated nonzeros of B */
  maij->B     = Bnew;

  PetscCheck(B->rmap->N == Bnew->rmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "BN %" PetscInt_FMT " != BnewN %" PetscInt_FMT, B->rmap->N, Bnew->rmap->N);

  /* Transfer ownership of the shared arrays from B to Bnew: clear B's free flags
     so MatDestroy(&B) does not free them, then set Bnew's flags so it will */
  b->free_a  = PETSC_FALSE;
  b->free_ij = PETSC_FALSE;
  PetscCall(MatDestroy(&B));

  bnew->free_a  = PETSC_TRUE;
  bnew->free_ij = PETSC_TRUE;

  /* condense columns of maij->B (assembly compacts it to only the nonzero columns) */
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
  PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3561 
3562 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3563 
/*
   MatCreateSubMatrix_MPIAIJ_SameRowDist - extracts a parallel submatrix in the case where the
   submatrix keeps mat's row distribution.

   iscol_local is a sequential index set gathering iscol; step (2) below requires it to be
   sorted (duplicate indices are allowed, see the comment at that step).

   On MAT_INITIAL_MATRIX the intermediate objects (sequential submatrix, column sub-index set,
   and column map) are composed on *newmat under the keys "SubMatrix", "SubIScol" and "Subcmap"
   so that a later MAT_REUSE_MATRIX call can retrieve and reuse them.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
{
  PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
  PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
  Mat             M, Msub, B = a->B;
  MatScalar      *aa;
  Mat_SeqAIJ     *aij;
  PetscInt       *garray = a->garray, *colsub, Ncols;
  PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
  IS              iscol_sub, iscmap;
  const PetscInt *is_idx, *cmap;
  PetscBool       allcolumns = PETSC_FALSE;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  if (call == MAT_REUSE_MATRIX) {
    /* retrieve the objects composed on *newmat by a previous MAT_INITIAL_MATRIX call */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
    PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
    PetscCall(ISGetLocalSize(iscol_sub, &count));

    PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
    PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
    PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");

    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));

  } else { /* call == MAT_INITIAL_MATRIX) */
    PetscBool flg;

    PetscCall(ISGetLocalSize(iscol, &n));
    PetscCall(ISGetSize(iscol, &Ncols));

    /* (1) iscol -> nonscalable iscol_local */
    /* Check for special case: each processor gets entire matrix columns */
    PetscCall(ISIdentity(iscol_local, &flg));
    if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
    /* all ranks must agree on allcolumns, hence the logical-AND reduction */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (allcolumns) {
      iscol_sub = iscol_local;
      PetscCall(PetscObjectReference((PetscObject)iscol_local));
      PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));

    } else {
      /* (2) iscol_local -> iscol_sub and iscmap. Implementation below requires iscol_local be sorted, it can have duplicate indices */
      PetscInt *idx, *cmap1, k;
      PetscCall(PetscMalloc1(Ncols, &idx));
      PetscCall(PetscMalloc1(Ncols, &cmap1));
      PetscCall(ISGetIndices(iscol_local, &is_idx));
      count = 0;
      k     = 0;
      /* keep only the requested columns this rank touches: diagonal-block columns plus
         those off-diagonal columns present in garray (merged scan, both lists sorted) */
      for (i = 0; i < Ncols; i++) {
        j = is_idx[i];
        if (j >= cstart && j < cend) {
          /* diagonal part of mat */
          idx[count]     = j;
          cmap1[count++] = i; /* column index in submat */
        } else if (Bn) {
          /* off-diagonal part of mat */
          if (j == garray[k]) {
            idx[count]     = j;
            cmap1[count++] = i; /* column index in submat */
          } else if (j > garray[k]) {
            while (j > garray[k] && k < Bn - 1) k++;
            if (j == garray[k]) {
              idx[count]     = j;
              cmap1[count++] = i; /* column index in submat */
            }
          }
        }
      }
      PetscCall(ISRestoreIndices(iscol_local, &is_idx));

      /* idx and cmap1 ownership passes to the index sets (PETSC_OWN_POINTER) */
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
      PetscCall(ISGetBlockSize(iscol, &cbs));
      PetscCall(ISSetBlockSize(iscol_sub, cbs));

      PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
    }

    /* (3) Create sequential Msub */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
  }

  PetscCall(ISGetLocalSize(iscol_sub, &count));
  aij = (Mat_SeqAIJ *)Msub->data;
  ii  = aij->i;
  PetscCall(ISGetIndices(iscmap, &cmap));

  /*
      m - number of local rows
      Ncols - number of columns (same on all processors)
      rstart - first row in new global matrix generated
  */
  PetscCall(MatGetSize(Msub, &m, NULL));

  if (call == MAT_INITIAL_MATRIX) {
    /* (4) Create parallel newmat */
    PetscMPIInt rank, size;
    PetscInt    csize;

    PetscCallMPI(MPI_Comm_size(comm, &size));
    PetscCallMPI(MPI_Comm_rank(comm, &rank));

    /*
        Determine the number of non-zeros in the diagonal and off-diagonal
        portions of the matrix in order to do correct preallocation
    */

    /* first get start and end of "diagonal" columns */
    PetscCall(ISGetLocalSize(iscol, &csize));
    if (csize == PETSC_DECIDE) {
      PetscCall(ISGetSize(isrow, &mglobal));
      if (mglobal == Ncols) { /* square matrix */
        nlocal = m;
      } else {
        nlocal = Ncols / size + ((Ncols % size) > rank);
      }
    } else {
      nlocal = csize;
    }
    /* prefix sum of the local column counts gives this rank's [rstart, rend) column range */
    PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
    rstart = rend - nlocal;
    PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);

    /* next, compute all the lengths */
    jj = aij->j;
    PetscCall(PetscMalloc1(2 * m + 1, &dlens));
    olens = dlens + m; /* olens shares the single allocation with dlens */
    for (i = 0; i < m; i++) {
      jend = ii[i + 1] - ii[i];
      olen = 0;
      dlen = 0;
      for (j = 0; j < jend; j++) {
        if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
        else dlen++;
        jj++;
      }
      olens[i] = olen;
      dlens[i] = dlen;
    }

    PetscCall(ISGetBlockSize(isrow, &bs));
    PetscCall(ISGetBlockSize(iscol, &cbs));

    PetscCall(MatCreate(comm, &M));
    PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
    PetscCall(MatSetBlockSizes(M, bs, cbs));
    PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
    PetscCall(PetscFree(dlens));

  } else { /* call == MAT_REUSE_MATRIX */
    M = *newmat;
    PetscCall(MatGetLocalSize(M, &i, NULL));
    PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
    PetscCall(MatZeroEntries(M));
    /*
         The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
       rather than the slower MatSetValues().
    */
    M->was_assembled = PETSC_TRUE;
    M->assembled     = PETSC_FALSE;
  }

  /* (5) Set values of Msub to *newmat */
  PetscCall(PetscMalloc1(count, &colsub));
  PetscCall(MatGetOwnershipRange(M, &rstart, NULL));

  jj = aij->j;
  PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
  for (i = 0; i < m; i++) {
    row = rstart + i;
    nz  = ii[i + 1] - ii[i];
    for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]]; /* map Msub's local columns to *newmat's global columns */
    PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
    jj += nz;
    aa += nz;
  }
  PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
  PetscCall(ISRestoreIndices(iscmap, &cmap));

  PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));

  PetscCall(PetscFree(colsub));

  /* save Msub, iscol_sub and iscmap used in processor for next request */
  if (call == MAT_INITIAL_MATRIX) {
    *newmat = M;
    PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
    PetscCall(MatDestroy(&Msub));

    PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
    PetscCall(ISDestroy(&iscol_sub));

    PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
    PetscCall(ISDestroy(&iscmap));

    if (iscol_local) {
      PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
      PetscCall(ISDestroy(&iscol_local));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3773 
/*
    Not great since it makes two copies of the submatrix, first an SeqAIJ
  in local and then by concatenating the local matrices the end result.
  Writing it directly would be much like MatCreateSubMatrices_MPIAIJ()

  This requires a sequential iscol with all indices.

  On MAT_INITIAL_MATRIX the sequential submatrix is composed on the result under the key
  "SubMatrix" so that MAT_REUSE_MATRIX calls can retrieve it.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
{
  PetscMPIInt rank, size;
  PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
  PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
  Mat         M, Mreuse;
  MatScalar  *aa, *vwork;
  MPI_Comm    comm;
  Mat_SeqAIJ *aij;
  PetscBool   colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol, &colflag));
  PetscCall(ISGetLocalSize(iscol, &n));
  if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
  /* all ranks must agree on allcolumns, hence the logical-AND reduction */
  PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));

  if (call == MAT_REUSE_MATRIX) {
    /* retrieve the sequential submatrix composed by a previous MAT_INITIAL_MATRIX call */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
    PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
  } else {
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
  }

  /*
      m - number of local rows
      n - number of columns (same on all processors)
      rstart - first row in new global matrix generated
  */
  PetscCall(MatGetSize(Mreuse, &m, &n));
  PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
  if (call == MAT_INITIAL_MATRIX) {
    aij = (Mat_SeqAIJ *)Mreuse->data;
    ii  = aij->i;
    jj  = aij->j;

    /*
        Determine the number of non-zeros in the diagonal and off-diagonal
        portions of the matrix in order to do correct preallocation
    */

    /* first get start and end of "diagonal" columns */
    if (csize == PETSC_DECIDE) {
      PetscCall(ISGetSize(isrow, &mglobal));
      if (mglobal == n) { /* square matrix */
        nlocal = m;
      } else {
        nlocal = n / size + ((n % size) > rank);
      }
    } else {
      nlocal = csize;
    }
    /* prefix sum of the local column counts gives this rank's [rstart, rend) column range */
    PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
    rstart = rend - nlocal;
    PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);

    /* next, compute all the lengths */
    PetscCall(PetscMalloc1(2 * m + 1, &dlens));
    olens = dlens + m; /* olens shares the single allocation with dlens */
    for (i = 0; i < m; i++) {
      jend = ii[i + 1] - ii[i];
      olen = 0;
      dlen = 0;
      for (j = 0; j < jend; j++) {
        if (*jj < rstart || *jj >= rend) olen++;
        else dlen++;
        jj++;
      }
      olens[i] = olen;
      dlens[i] = dlen;
    }
    PetscCall(MatCreate(comm, &M));
    PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
    PetscCall(MatSetBlockSizes(M, bs, cbs));
    PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
    PetscCall(PetscFree(dlens));
  } else {
    PetscInt ml, nl;

    M = *newmat;
    PetscCall(MatGetLocalSize(M, &ml, &nl));
    PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
    PetscCall(MatZeroEntries(M));
    /*
         The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
       rather than the slower MatSetValues().
    */
    M->was_assembled = PETSC_TRUE;
    M->assembled     = PETSC_FALSE;
  }
  PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
  aij = (Mat_SeqAIJ *)Mreuse->data;
  ii  = aij->i;
  jj  = aij->j;

  /* trigger copy to CPU if needed */
  PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
  for (i = 0; i < m; i++) {
    row   = rstart + i;
    nz    = ii[i + 1] - ii[i];
    cwork = jj;
    jj    = PetscSafePointerPlusOffset(jj, nz); /* safe advance: jj/aa may be NULL when the matrix is empty */
    vwork = aa;
    aa    = PetscSafePointerPlusOffset(aa, nz);
    PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
  }
  PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));

  PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
  *newmat = M;

  /* save submatrix used in processor for next request */
  if (call == MAT_INITIAL_MATRIX) {
    PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
    PetscCall(MatDestroy(&Mreuse));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3907 
/* Type-specific implementation of MatMPIAIJSetPreallocationCSR(): preallocates B from the
   per-row counts in the local CSR arrays (Ii, J), inserts the values, assembles, and records
   the below-block-diagonal count per row in Aij->ld. */
static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
{
  PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
  PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
  const PetscInt *JJ;
  PetscBool       nooffprocentries;
  Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  m       = B->rmap->n;
  cstart  = B->cmap->rstart;
  cend    = B->cmap->rend;
  rstart  = B->rmap->rstart;
  irstart = Ii[0]; /* Ii need not start at 0; J is indexed relative to this offset below */

  PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));

  if (PetscDefined(USE_DEBUG)) {
    /* debug-only per-row checks: non-negative length, first column >= 0, last column < N */
    for (i = 0; i < m; i++) {
      nnz = Ii[i + 1] - Ii[i];
      JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
      PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
      PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
      PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
    }
  }

  /* split each row's count into diagonal-block (cstart <= col < cend) and off-diagonal parts */
  for (i = 0; i < m; i++) {
    nnz     = Ii[i + 1] - Ii[i];
    JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
    nnz_max = PetscMax(nnz_max, nnz);
    d       = 0;
    for (j = 0; j < nnz; j++) {
      if (cstart <= JJ[j] && JJ[j] < cend) d++;
    }
    d_nnz[i] = d;
    o_nnz[i] = nnz - d;
  }
  PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
  PetscCall(PetscFree2(d_nnz, o_nnz));

  for (i = 0; i < m; i++) {
    ii = i + rstart; /* global row number */
    PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
  }
  /* all entries are local here, so suppress the off-process communication during assembly */
  nooffprocentries    = B->nooffprocentries;
  B->nooffprocentries = PETSC_TRUE;
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->nooffprocentries = nooffprocentries;

  /* count number of entries below block diagonal */
  PetscCall(PetscFree(Aij->ld));
  PetscCall(PetscCalloc1(m, &ld));
  Aij->ld = ld;
  for (i = 0; i < m; i++) {
    nnz = Ii[i + 1] - Ii[i];
    j   = 0;
    /* NOTE(review): J is dereferenced here before the NULL guard below; this assumes J is
       non-NULL whenever some row is nonempty (nnz > 0) — confirm against callers */
    while (j < nnz && J[j] < cstart) j++;
    ld[i] = j;
    if (J) J += nnz;
  }

  PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3976 
3977 /*@
3978   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3979   (the default parallel PETSc format).
3980 
3981   Collective
3982 
3983   Input Parameters:
3984 + B - the matrix
3985 . i - the indices into `j` for the start of each local row (indices start with zero)
3986 . j - the column indices for each local row (indices start with zero)
3987 - v - optional values in the matrix
3988 
3989   Level: developer
3990 
3991   Notes:
3992   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3993   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3994   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3995 
3996   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
3997 
3998   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3999 
4000   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
4001 
4002   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
4003   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
4004 
4005   The format which is used for the sparse matrix input, is equivalent to a
  row-major ordering, i.e., for the following matrix, the input data expected is
4007   as shown
4008 .vb
4009         1 0 0
4010         2 0 3     P0
4011        -------
4012         4 5 6     P1
4013 
4014      Process0 [P0] rows_owned=[0,1]
4015         i =  {0,1,3}  [size = nrow+1  = 2+1]
4016         j =  {0,0,2}  [size = 3]
4017         v =  {1,2,3}  [size = 3]
4018 
4019      Process1 [P1] rows_owned=[2]
4020         i =  {0,3}    [size = nrow+1  = 1+1]
4021         j =  {0,1,2}  [size = 3]
4022         v =  {4,5,6}  [size = 3]
4023 .ve
4024 
4025 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
4026           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4027 @*/
4028 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
4029 {
4030   PetscFunctionBegin;
4031   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
4032   PetscFunctionReturn(PETSC_SUCCESS);
4033 }
4034 
4035 /*@
4036   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4037   (the default parallel PETSc format).  For good matrix assembly performance
4038   the user should preallocate the matrix storage by setting the parameters
4039   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4040 
4041   Collective
4042 
4043   Input Parameters:
4044 + B     - the matrix
4045 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4046            (same value is used for all local rows)
4047 . d_nnz - array containing the number of nonzeros in the various rows of the
4048            DIAGONAL portion of the local submatrix (possibly different for each row)
4049            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4050            The size of this array is equal to the number of local rows, i.e 'm'.
4051            For matrices that will be factored, you must leave room for (and set)
4052            the diagonal entry even if it is zero.
4053 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4054            submatrix (same value is used for all local rows).
4055 - o_nnz - array containing the number of nonzeros in the various rows of the
4056            OFF-DIAGONAL portion of the local submatrix (possibly different for
4057            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4058            structure. The size of this array is equal to the number
4059            of local rows, i.e 'm'.
4060 
4061   Example Usage:
4062   Consider the following 8x8 matrix with 34 non-zero values, that is
  assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4064   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4065   as follows
4066 
4067 .vb
4068             1  2  0  |  0  3  0  |  0  4
4069     Proc0   0  5  6  |  7  0  0  |  8  0
4070             9  0 10  | 11  0  0  | 12  0
4071     -------------------------------------
4072            13  0 14  | 15 16 17  |  0  0
4073     Proc1   0 18  0  | 19 20 21  |  0  0
4074             0  0  0  | 22 23  0  | 24  0
4075     -------------------------------------
4076     Proc2  25 26 27  |  0  0 28  | 29  0
4077            30  0  0  | 31 32 33  |  0 34
4078 .ve
4079 
4080   This can be represented as a collection of submatrices as
4081 .vb
4082       A B C
4083       D E F
4084       G H I
4085 .ve
4086 
4087   Where the submatrices A,B,C are owned by proc0, D,E,F are
4088   owned by proc1, G,H,I are owned by proc2.
4089 
4090   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4091   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4092   The 'M','N' parameters are 8,8, and have the same values on all procs.
4093 
4094   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4095   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4096   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4097   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4098   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4099   matrix, and [DF] as another `MATSEQAIJ` matrix.
4100 
4101   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4102   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4103   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4104   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4105   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4106   In this case, the values of `d_nz`, `o_nz` are
4107 .vb
4108      proc0  dnz = 2, o_nz = 2
4109      proc1  dnz = 3, o_nz = 2
4110      proc2  dnz = 1, o_nz = 4
4111 .ve
4112   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4113   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2, i.e., we are using 12+15+10=37 storage locations to store
4115   34 values.
4116 
4117   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4118   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4119   In the above case the values for `d_nnz`, `o_nnz` are
4120 .vb
4121      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4122      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4123      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4124 .ve
4125   Here the space allocated is sum of all the above values i.e 34, and
4126   hence pre-allocation is perfect.
4127 
4128   Level: intermediate
4129 
4130   Notes:
4131   If the *_nnz parameter is given then the *_nz parameter is ignored
4132 
4133   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4134   storage.  The stored row and column indices begin with zero.
4135   See [Sparse Matrices](sec_matsparse) for details.
4136 
4137   The parallel matrix is partitioned such that the first m0 rows belong to
4138   process 0, the next m1 rows belong to process 1, the next m2 rows belong
  to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4140 
4141   The DIAGONAL portion of the local submatrix of a processor can be defined
  as the submatrix which is obtained by extracting the part corresponding to
  the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
  first row that belongs to the processor, r2 is the last row belonging to
  this processor, and c1-c2 is the range of indices of the local part of a
  vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4147   common case of a square matrix, the row and column ranges are the same and
4148   the DIAGONAL part is also square. The remaining portion of the local
4149   submatrix (mxN) constitute the OFF-DIAGONAL portion.
4150 
4151   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4152 
4153   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4154   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
4155   You can also run with the option `-info` and look for messages with the string
4156   malloc in them to see if additional memory allocation was needed.
4157 
4158 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4159           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4160 @*/
PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* dispatch to the type-specific implementation; a no-op when the method is not registered for B's type */
  PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4169 
4170 /*@
4171   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain in standard
4172   CSR format for the local rows.
4173 
4174   Collective
4175 
4176   Input Parameters:
4177 + comm - MPI communicator
4178 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4179 . n    - This value should be the same as the local size used in creating the
4180          x vector for the matrix-vector product $ y = Ax$. (or `PETSC_DECIDE` to have
4181          calculated if `N` is given) For square matrices n is almost always `m`.
4182 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4183 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4184 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4185 . j    - global column indices
4186 - a    - optional matrix values
4187 
4188   Output Parameter:
4189 . mat - the matrix
4190 
4191   Level: intermediate
4192 
4193   Notes:
4194   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4195   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4196   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4197 
4198   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4199 
4200   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`
4201 
4202   If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
4203   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
4204 
4205   The format which is used for the sparse matrix input, is equivalent to a
4206   row-major ordering, i.e., for the following matrix, the input data expected is
4207   as shown
4208 .vb
4209         1 0 0
4210         2 0 3     P0
4211        -------
4212         4 5 6     P1
4213 
4214      Process0 [P0] rows_owned=[0,1]
4215         i =  {0,1,3}  [size = nrow+1  = 2+1]
4216         j =  {0,0,2}  [size = 3]
4217         v =  {1,2,3}  [size = 3]
4218 
4219      Process1 [P1] rows_owned=[2]
4220         i =  {0,3}    [size = nrow+1  = 1+1]
4221         j =  {0,1,2}  [size = 3]
4222         v =  {4,5,6}  [size = 3]
4223 .ve
4224 
4225 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4226           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4227 @*/
PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
{
  PetscFunctionBegin;
  /* the row-offset array must be local (start at 0); a nonzero i[0] suggests global offsets were passed */
  PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
  PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
  PetscCall(MatCreate(comm, mat));
  PetscCall(MatSetSizes(*mat, m, n, M, N));
  /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
  PetscCall(MatSetType(*mat, MATMPIAIJ));
  /* copies i, j, a into the internal format; see the manual page above */
  PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4240 
4241 /*@
4242   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain in standard
  CSR format for the local rows. Only the numerical values are updated; the other arrays must be identical to what was passed
4244   from `MatCreateMPIAIJWithArrays()`
4245 
4246   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4247 
4248   Collective
4249 
4250   Input Parameters:
4251 + mat - the matrix
4252 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4253 . n   - This value should be the same as the local size used in creating the
4254        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4255        calculated if N is given) For square matrices n is almost always m.
4256 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4257 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4258 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4259 . J   - column indices
4260 - v   - matrix values
4261 
4262   Level: deprecated
4263 
4264 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4265           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4266 @*/
PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
{
  PetscInt        nnz, i;
  PetscBool       nooffprocentries;
  Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
  Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
  PetscScalar    *ad, *ao;
  PetscInt        ldi, Iii, md;
  const PetscInt *Adi = Ad->i;
  PetscInt       *ld  = Aij->ld; /* ld[i]: number of off-diagonal entries of row i whose global column lies left of the diagonal block */

  PetscFunctionBegin;
  PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
  PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
  PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
  PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");

  /* write directly into the value arrays of the diagonal (A) and off-diagonal (B) blocks */
  PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));

  for (i = 0; i < m; i++) {
    if (PetscDefined(USE_DEBUG)) {
      /* the splitting below relies on each row's column indices being sorted and unique */
      for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
        PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
        PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
      }
    }
    nnz = Ii[i + 1] - Ii[i];   /* total entries of row i in the input CSR */
    Iii = Ii[i];               /* start of row i in J/v */
    ldi = ld[i];               /* leading off-diagonal entries (columns left of the diagonal block) */
    md  = Adi[i + 1] - Adi[i]; /* entries belonging to the diagonal block */
    /* a sorted CSR row is laid out [off-diag left | diagonal block | off-diag right]; split v accordingly */
    PetscCall(PetscArraycpy(ao, v + Iii, ldi));
    PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
    PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
    ad += md;
    ao += nnz - md;
  }
  /* only local arrays were written, so assembly need not communicate any stashed off-process values */
  nooffprocentries      = mat->nooffprocentries;
  mat->nooffprocentries = PETSC_TRUE;
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
  PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
  mat->nooffprocentries = nooffprocentries;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4316 
4317 /*@
  MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4319 
4320   Collective
4321 
4322   Input Parameters:
4323 + mat - the matrix
4324 - v   - matrix values, stored by row
4325 
4326   Level: intermediate
4327 
4328   Notes:
4329   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4330 
4331   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4332 
4333 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4334           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4335 @*/
4336 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4337 {
4338   PetscInt        nnz, i, m;
4339   PetscBool       nooffprocentries;
4340   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4341   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4342   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4343   PetscScalar    *ad, *ao;
4344   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4345   PetscInt        ldi, Iii, md;
4346   PetscInt       *ld = Aij->ld;
4347 
4348   PetscFunctionBegin;
4349   m = mat->rmap->n;
4350 
4351   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4352   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4353   Iii = 0;
4354   for (i = 0; i < m; i++) {
4355     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4356     ldi = ld[i];
4357     md  = Adi[i + 1] - Adi[i];
4358     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4359     ad += md;
4360     if (ao) {
4361       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4362       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4363       ao += nnz - md;
4364     }
4365     Iii += nnz;
4366   }
4367   nooffprocentries      = mat->nooffprocentries;
4368   mat->nooffprocentries = PETSC_TRUE;
4369   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4370   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4371   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4372   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4373   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4374   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4375   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4376   mat->nooffprocentries = nooffprocentries;
4377   PetscFunctionReturn(PETSC_SUCCESS);
4378 }
4379 
4380 /*@
4381   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4382   (the default parallel PETSc format).  For good matrix assembly performance
4383   the user should preallocate the matrix storage by setting the parameters
4384   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4385 
4386   Collective
4387 
4388   Input Parameters:
4389 + comm  - MPI communicator
4390 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4391           This value should be the same as the local size used in creating the
4392           y vector for the matrix-vector product y = Ax.
4393 . n     - This value should be the same as the local size used in creating the
4394           x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4395           calculated if N is given) For square matrices n is almost always m.
4396 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4397 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4398 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4399           (same value is used for all local rows)
4400 . d_nnz - array containing the number of nonzeros in the various rows of the
4401           DIAGONAL portion of the local submatrix (possibly different for each row)
4402           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4403           The size of this array is equal to the number of local rows, i.e 'm'.
4404 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4405           submatrix (same value is used for all local rows).
4406 - o_nnz - array containing the number of nonzeros in the various rows of the
4407           OFF-DIAGONAL portion of the local submatrix (possibly different for
4408           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4409           structure. The size of this array is equal to the number
4410           of local rows, i.e 'm'.
4411 
4412   Output Parameter:
4413 . A - the matrix
4414 
4415   Options Database Keys:
4416 + -mat_no_inode                     - Do not use inodes
4417 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4418 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4419                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4420                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4421 
4422   Level: intermediate
4423 
4424   Notes:
4425   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4426   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4427   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4428 
4429   If the *_nnz parameter is given then the *_nz parameter is ignored
4430 
4431   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4432   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4433   storage requirements for this matrix.
4434 
  If `PETSC_DECIDE` or `PETSC_DETERMINE` is used for a particular argument on one
  processor then it must be used on all processors that share the object for
  that argument.
4438 
4439   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4440   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4441 
4442   The user MUST specify either the local or global matrix dimensions
4443   (possibly both).
4444 
4445   The parallel matrix is partitioned across processors such that the
4446   first `m0` rows belong to process 0, the next `m1` rows belong to
4447   process 1, the next `m2` rows belong to process 2, etc., where
4448   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4449   values corresponding to [m x N] submatrix.
4450 
4451   The columns are logically partitioned with the n0 columns belonging
4452   to 0th partition, the next n1 columns belonging to the next
4453   partition etc.. where n0,n1,n2... are the input parameter 'n'.
4454 
4455   The DIAGONAL portion of the local submatrix on any given processor
4456   is the submatrix corresponding to the rows and columns m,n
4457   corresponding to the given processor. i.e diagonal matrix on
4458   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4459   etc. The remaining portion of the local submatrix [m x (N-n)]
4460   constitute the OFF-DIAGONAL portion. The example below better
4461   illustrates this concept. The two matrices, the DIAGONAL portion and
4462   the OFF-DIAGONAL portion are each stored as `MATSEQAIJ` matrices.
4463 
4464   For a square global matrix we define each processor's diagonal portion
4465   to be its local rows and the corresponding columns (a square submatrix);
4466   each processor's off-diagonal portion encompasses the remainder of the
4467   local matrix (a rectangular submatrix).
4468 
4469   If `o_nnz`, `d_nnz` are specified, then `o_nz`, and `d_nz` are ignored.
4470 
4471   When calling this routine with a single process communicator, a matrix of
4472   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4473   type of communicator, use the construction mechanism
4474 .vb
4475   MatCreate(..., &A);
4476   MatSetType(A, MATMPIAIJ);
4477   MatSetSizes(A, m, n, M, N);
4478   MatMPIAIJSetPreallocation(A, ...);
4479 .ve
4480 
4481   By default, this format uses inodes (identical nodes) when possible.
4482   We search for consecutive rows with the same nonzero structure, thereby
4483   reusing matrix information to achieve increased efficiency.
4484 
4485   Example Usage:
4486   Consider the following 8x8 matrix with 34 non-zero values, that is
4487   assembled across 3 processors. Lets assume that proc0 owns 3 rows,
4488   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4489   as follows
4490 
4491 .vb
4492             1  2  0  |  0  3  0  |  0  4
4493     Proc0   0  5  6  |  7  0  0  |  8  0
4494             9  0 10  | 11  0  0  | 12  0
4495     -------------------------------------
4496            13  0 14  | 15 16 17  |  0  0
4497     Proc1   0 18  0  | 19 20 21  |  0  0
4498             0  0  0  | 22 23  0  | 24  0
4499     -------------------------------------
4500     Proc2  25 26 27  |  0  0 28  | 29  0
4501            30  0  0  | 31 32 33  |  0 34
4502 .ve
4503 
4504   This can be represented as a collection of submatrices as
4505 
4506 .vb
4507       A B C
4508       D E F
4509       G H I
4510 .ve
4511 
4512   Where the submatrices A,B,C are owned by proc0, D,E,F are
4513   owned by proc1, G,H,I are owned by proc2.
4514 
4515   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4516   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4517   The 'M','N' parameters are 8,8, and have the same values on all procs.
4518 
4519   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4520   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4521   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4522   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4523   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4524   matrix, and [DF] as another SeqAIJ matrix.
4525 
4526   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4527   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4528   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4529   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4530   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4531   In this case, the values of `d_nz`,`o_nz` are
4532 .vb
4533      proc0  dnz = 2, o_nz = 2
4534      proc1  dnz = 3, o_nz = 2
4535      proc2  dnz = 1, o_nz = 4
4536 .ve
4537   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4538   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2. i.e. we are using 12+15+10=37 storage locations to store
4540   34 values.
4541 
4542   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4543   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4544   In the above case the values for d_nnz,o_nnz are
4545 .vb
4546      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4547      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4548      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4549 .ve
4550   Here the space allocated is sum of all the above values i.e 34, and
4551   hence pre-allocation is perfect.
4552 
4553 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4554           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4555           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4556 @*/
4557 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4558 {
4559   PetscMPIInt size;
4560 
4561   PetscFunctionBegin;
4562   PetscCall(MatCreate(comm, A));
4563   PetscCall(MatSetSizes(*A, m, n, M, N));
4564   PetscCallMPI(MPI_Comm_size(comm, &size));
4565   if (size > 1) {
4566     PetscCall(MatSetType(*A, MATMPIAIJ));
4567     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4568   } else {
4569     PetscCall(MatSetType(*A, MATSEQAIJ));
4570     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4571   }
4572   PetscFunctionReturn(PETSC_SUCCESS);
4573 }
4574 
4575 /*MC
4576     MatMPIAIJGetSeqAIJF90 - Returns the local pieces of this distributed matrix
4577 
4578     Synopsis:
4579     MatMPIAIJGetSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4580 
4581     Not Collective
4582 
4583     Input Parameter:
4584 .   A - the `MATMPIAIJ` matrix
4585 
4586     Output Parameters:
4587 +   Ad - the diagonal portion of the matrix
4588 .   Ao - the off-diagonal portion of the matrix
4589 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4590 -   ierr - error code
4591 
4592      Level: advanced
4593 
4594     Note:
4595     Use  `MatMPIAIJRestoreSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4596 
4597 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJRestoreSeqAIJF90()`
4598 M*/
4599 
4600 /*MC
4601     MatMPIAIJRestoreSeqAIJF90 - call after `MatMPIAIJGetSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4602 
4603     Synopsis:
4604     MatMPIAIJRestoreSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4605 
4606     Not Collective
4607 
4608     Input Parameters:
4609 +   A - the `MATMPIAIJ` matrix
4610 .   Ad - the diagonal portion of the matrix
4611 .   Ao - the off-diagonal portion of the matrix
4612 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4613 -   ierr - error code
4614 
4615      Level: advanced
4616 
4617 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJGetSeqAIJF90()`
4618 M*/
4619 
4620 /*@C
4621   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4622 
4623   Not Collective
4624 
4625   Input Parameter:
4626 . A - The `MATMPIAIJ` matrix
4627 
4628   Output Parameters:
4629 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4630 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4631 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4632 
4633   Level: intermediate
4634 
4635   Note:
4636   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
  in `Ad` are in [0, Nc) where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4638   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4639   local column numbers to global column numbers in the original matrix.
4640 
4641   Fortran Notes:
4642   `MatMPIAIJGetSeqAIJ()` Fortran binding is deprecated (since PETSc 3.19), use `MatMPIAIJGetSeqAIJF90()`
4643 
4644 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJF90()`, `MatMPIAIJRestoreSeqAIJF90()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4645 @*/
4646 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4647 {
4648   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4649   PetscBool   flg;
4650 
4651   PetscFunctionBegin;
4652   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4653   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4654   if (Ad) *Ad = a->A;
4655   if (Ao) *Ao = a->B;
4656   if (colmap) *colmap = a->garray;
4657   PetscFunctionReturn(PETSC_SUCCESS);
4658 }
4659 
/* Concatenate, in rank order, the rows of each process's sequential matrix 'inmat'
   into one parallel matrix 'outmat'; with MAT_INITIAL_MATRIX the result is created
   (symbolic phase), otherwise only the values are re-inserted. */
PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
{
  PetscInt     m, N, i, rstart, nnz, Ii;
  PetscInt    *indx;
  PetscScalar *values;
  MatType      rootType;

  PetscFunctionBegin;
  PetscCall(MatGetSize(inmat, &m, &N));
  if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
    PetscInt *dnz, *onz, sum, bs, cbs;

    /* choose a local column size if the caller left it to PETSc */
    if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
    /* Check sum(n) = N */
    PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
    PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);

    /* first global row owned here = total rows contributed by lower ranks */
    PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
    rstart -= m;

    /* count diagonal/off-diagonal nonzeros per row for exact preallocation */
    MatPreallocateBegin(comm, m, n, dnz, onz);
    for (i = 0; i < m; i++) {
      PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
      PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
      PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
    }

    PetscCall(MatCreate(comm, outmat));
    PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
    PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
    PetscCall(MatGetRootType_Private(inmat, &rootType));
    PetscCall(MatSetType(*outmat, rootType));
    /* only one of the two preallocation calls takes effect, depending on the actual type */
    PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
    PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
    MatPreallocateEnd(dnz, onz);
    /* every process inserts only rows it owns, so assembly needs no communication */
    PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  }

  /* numeric phase */
  PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
  for (i = 0; i < m; i++) {
    PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
    Ii = i + rstart; /* global row index of local row i */
    PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
  }
  PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4711 
/* Container destructor for the Mat_Merge_SeqsToMPI context attached to the parallel
   matrix by the seqs-to-MPI merge; frees every array owned by the context. */
static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void **data)
{
  Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)*data;

  PetscFunctionBegin;
  if (!merge) PetscFunctionReturn(PETSC_SUCCESS); /* nothing was ever attached */
  PetscCall(PetscFree(merge->id_r));
  PetscCall(PetscFree(merge->len_s));
  PetscCall(PetscFree(merge->len_r));
  PetscCall(PetscFree(merge->bi));
  PetscCall(PetscFree(merge->bj));
  /* buf_ri[0]/buf_rj[0] hold the actual receive buffers; the pointer arrays are freed after */
  PetscCall(PetscFree(merge->buf_ri[0]));
  PetscCall(PetscFree(merge->buf_ri));
  PetscCall(PetscFree(merge->buf_rj[0]));
  PetscCall(PetscFree(merge->buf_rj));
  PetscCall(PetscFree(merge->coi));
  PetscCall(PetscFree(merge->coj));
  PetscCall(PetscFree(merge->owners_co));
  PetscCall(PetscLayoutDestroy(&merge->rowmap));
  PetscCall(PetscFree(merge));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4734 
4735 #include <../src/mat/utils/freespace.h>
4736 #include <petscbt.h>
4737 
/* Numeric phase of summing per-process sequential matrices into the parallel matrix
   'mpimat' produced by MatCreateMPIAIJSumSeqAIJSymbolic(): ship each row's values to
   its owning rank and accumulate local plus received contributions row by row. */
PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
{
  MPI_Comm             comm;
  Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
  PetscMPIInt          size, rank, taga, *len_s;
  PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
  PetscMPIInt          proc, k;
  PetscInt           **buf_ri, **buf_rj;
  PetscInt             anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
  PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
  MPI_Request         *s_waits, *r_waits;
  MPI_Status          *status;
  const MatScalar     *aa, *a_a;
  MatScalar          **abuf_r, *ba_i;
  Mat_Merge_SeqsToMPI *merge;
  PetscContainer       container;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
  PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* fetch the merge context (communication pattern + merged ij structure) stashed by the symbolic phase */
  PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
  PetscCall(PetscContainerGetPointer(container, (void **)&merge));
  PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
  aa = a_a;

  bi     = merge->bi; /* row offsets of the merged parallel structure */
  bj     = merge->bj; /* column indices of the merged parallel structure */
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  PetscCall(PetscMalloc1(size, &status));
  owners = merge->rowmap->range;
  len_s  = merge->len_s;

  /* send and recv matrix values */
  PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
  PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));

  PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* each message is the contiguous slice of this rank's values for the rows [proc] owns */
    i = owners[proc];
    PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
    k++;
  }

  if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
  PetscCall(PetscFree(status));

  PetscCall(PetscFree(s_waits));
  PetscCall(PetscFree(r_waits));

  /* insert mat values of mpimat */
  PetscCall(PetscMalloc1(N, &ba_i)); /* scratch row of values, big enough for any merged row */
  PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));

  for (k = 0; k < merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  /* set values of ba */
  m = merge->rowmap->n;
  for (i = 0; i < m; i++) {
    arow = owners[rank] + i; /* global row index */
    bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
    bnzi = bi[i + 1] - bi[i];
    PetscCall(PetscArrayzero(ba_i, bnzi));

    /* add local non-zero vals of this proc's seqmat into ba */
    anzi   = ai[arow + 1] - ai[arow];
    aj     = a->j + ai[arow];
    aa     = a_a + ai[arow];
    nextaj = 0;
    /* merge-scan: bj_i is a superset of aj, both sorted, so advance j until columns match */
    for (j = 0; nextaj < anzi; j++) {
      if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
        ba_i[j] += aa[nextaj++];
      }
    }

    /* add received vals into ba */
    for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        anzi   = *(nextai[k] + 1) - *nextai[k];
        aj     = buf_rj[k] + *nextai[k];
        aa     = abuf_r[k] + *nextai[k];
        nextaj = 0;
        for (j = 0; nextaj < anzi; j++) {
          if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
            ba_i[j] += aa[nextaj++];
          }
        }
        nextrow[k]++;
        nextai[k]++;
      }
    }
    PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
  }
  PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
  PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));

  /* abuf_r[0] is the single allocation backing all receive buffers */
  PetscCall(PetscFree(abuf_r[0]));
  PetscCall(PetscFree(abuf_r));
  PetscCall(PetscFree(ba_i));
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
  PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4856 
4857 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4858 {
4859   Mat                  B_mpi;
4860   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4861   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4862   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4863   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4864   PetscInt             len, *dnz, *onz, bs, cbs;
4865   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4866   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4867   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4868   MPI_Status          *status;
4869   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4870   PetscBT              lnkbt;
4871   Mat_Merge_SeqsToMPI *merge;
4872   PetscContainer       container;
4873 
4874   PetscFunctionBegin;
4875   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4876 
4877   /* make sure it is a PETSc comm */
4878   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4879   PetscCallMPI(MPI_Comm_size(comm, &size));
4880   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4881 
4882   PetscCall(PetscNew(&merge));
4883   PetscCall(PetscMalloc1(size, &status));
4884 
4885   /* determine row ownership */
4886   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4887   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4888   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4889   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4890   PetscCall(PetscLayoutSetUp(merge->rowmap));
4891   PetscCall(PetscMalloc1(size, &len_si));
4892   PetscCall(PetscMalloc1(size, &merge->len_s));
4893 
4894   m      = merge->rowmap->n;
4895   owners = merge->rowmap->range;
4896 
4897   /* determine the number of messages to send, their lengths */
4898   len_s = merge->len_s;
4899 
4900   len          = 0; /* length of buf_si[] */
4901   merge->nsend = 0;
4902   for (PetscMPIInt proc = 0; proc < size; proc++) {
4903     len_si[proc] = 0;
4904     if (proc == rank) {
4905       len_s[proc] = 0;
4906     } else {
4907       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4908       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4909     }
4910     if (len_s[proc]) {
4911       merge->nsend++;
4912       nrows = 0;
4913       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4914         if (ai[i + 1] > ai[i]) nrows++;
4915       }
4916       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4917       len += len_si[proc];
4918     }
4919   }
4920 
4921   /* determine the number and length of messages to receive for ij-structure */
4922   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4923   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4924 
4925   /* post the Irecv of j-structure */
4926   PetscCall(PetscCommGetNewTag(comm, &tagj));
4927   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4928 
4929   /* post the Isend of j-structure */
4930   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4931 
4932   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4933     if (!len_s[proc]) continue;
4934     i = owners[proc];
4935     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4936     k++;
4937   }
4938 
4939   /* receives and sends of j-structure are complete */
4940   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4941   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4942 
4943   /* send and recv i-structure */
4944   PetscCall(PetscCommGetNewTag(comm, &tagi));
4945   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4946 
4947   PetscCall(PetscMalloc1(len + 1, &buf_s));
4948   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4949   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4950     if (!len_s[proc]) continue;
4951     /* form outgoing message for i-structure:
4952          buf_si[0]:                 nrows to be sent
4953                [1:nrows]:           row index (global)
4954                [nrows+1:2*nrows+1]: i-structure index
4955     */
4956     nrows       = len_si[proc] / 2 - 1;
4957     buf_si_i    = buf_si + nrows + 1;
4958     buf_si[0]   = nrows;
4959     buf_si_i[0] = 0;
4960     nrows       = 0;
4961     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4962       anzi = ai[i + 1] - ai[i];
4963       if (anzi) {
4964         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4965         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4966         nrows++;
4967       }
4968     }
4969     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4970     k++;
4971     buf_si += len_si[proc];
4972   }
4973 
4974   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4975   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4976 
4977   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4978   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4979 
4980   PetscCall(PetscFree(len_si));
4981   PetscCall(PetscFree(len_ri));
4982   PetscCall(PetscFree(rj_waits));
4983   PetscCall(PetscFree2(si_waits, sj_waits));
4984   PetscCall(PetscFree(ri_waits));
4985   PetscCall(PetscFree(buf_s));
4986   PetscCall(PetscFree(status));
4987 
4988   /* compute a local seq matrix in each processor */
4989   /* allocate bi array and free space for accumulating nonzero column info */
4990   PetscCall(PetscMalloc1(m + 1, &bi));
4991   bi[0] = 0;
4992 
4993   /* create and initialize a linked list */
4994   nlnk = N + 1;
4995   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4996 
4997   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4998   len = ai[owners[rank + 1]] - ai[owners[rank]];
4999   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
5000 
5001   current_space = free_space;
5002 
5003   /* determine symbolic info for each local row */
5004   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
5005 
5006   for (k = 0; k < merge->nrecv; k++) {
5007     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
5008     nrows       = *buf_ri_k[k];
5009     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
5010     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
5011   }
5012 
5013   MatPreallocateBegin(comm, m, n, dnz, onz);
5014   len = 0;
5015   for (i = 0; i < m; i++) {
5016     bnzi = 0;
5017     /* add local non-zero cols of this proc's seqmat into lnk */
5018     arow = owners[rank] + i;
5019     anzi = ai[arow + 1] - ai[arow];
5020     aj   = a->j + ai[arow];
5021     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5022     bnzi += nlnk;
5023     /* add received col data into lnk */
5024     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
5025       if (i == *nextrow[k]) {            /* i-th row */
5026         anzi = *(nextai[k] + 1) - *nextai[k];
5027         aj   = buf_rj[k] + *nextai[k];
5028         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5029         bnzi += nlnk;
5030         nextrow[k]++;
5031         nextai[k]++;
5032       }
5033     }
5034     if (len < bnzi) len = bnzi; /* =max(bnzi) */
5035 
5036     /* if free space is not available, make more free space */
5037     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
5038     /* copy data into free space, then initialize lnk */
5039     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
5040     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
5041 
5042     current_space->array += bnzi;
5043     current_space->local_used += bnzi;
5044     current_space->local_remaining -= bnzi;
5045 
5046     bi[i + 1] = bi[i] + bnzi;
5047   }
5048 
5049   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
5050 
5051   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
5052   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
5053   PetscCall(PetscLLDestroy(lnk, lnkbt));
5054 
5055   /* create symbolic parallel matrix B_mpi */
5056   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
5057   PetscCall(MatCreate(comm, &B_mpi));
5058   if (n == PETSC_DECIDE) {
5059     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
5060   } else {
5061     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
5062   }
5063   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
5064   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
5065   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
5066   MatPreallocateEnd(dnz, onz);
5067   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
5068 
5069   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
5070   B_mpi->assembled = PETSC_FALSE;
5071   merge->bi        = bi;
5072   merge->bj        = bj;
5073   merge->buf_ri    = buf_ri;
5074   merge->buf_rj    = buf_rj;
5075   merge->coi       = NULL;
5076   merge->coj       = NULL;
5077   merge->owners_co = NULL;
5078 
5079   PetscCall(PetscCommDestroy(&comm));
5080 
5081   /* attach the supporting struct to B_mpi for reuse */
5082   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5083   PetscCall(PetscContainerSetPointer(container, merge));
5084   PetscCall(PetscContainerSetCtxDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5085   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5086   PetscCall(PetscContainerDestroy(&container));
5087   *mpimat = B_mpi;
5088 
5089   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5090   PetscFunctionReturn(PETSC_SUCCESS);
5091 }
5092 
5093 /*@
5094   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5095   matrices from each processor
5096 
5097   Collective
5098 
5099   Input Parameters:
5100 + comm   - the communicators the parallel matrix will live on
5101 . seqmat - the input sequential matrices
5102 . m      - number of local rows (or `PETSC_DECIDE`)
5103 . n      - number of local columns (or `PETSC_DECIDE`)
5104 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5105 
5106   Output Parameter:
5107 . mpimat - the parallel matrix generated
5108 
5109   Level: advanced
5110 
5111   Note:
5112   The dimensions of the sequential matrix in each processor MUST be the same.
5113   The input seqmat is included into the container "Mat_Merge_SeqsToMPI", and will be
5114   destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5115 
5116 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5117 @*/
5118 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5119 {
5120   PetscMPIInt size;
5121 
5122   PetscFunctionBegin;
5123   PetscCallMPI(MPI_Comm_size(comm, &size));
5124   if (size == 1) {
5125     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5126     if (scall == MAT_INITIAL_MATRIX) {
5127       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5128     } else {
5129       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5130     }
5131     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5132     PetscFunctionReturn(PETSC_SUCCESS);
5133   }
5134   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5135   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5136   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5137   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5138   PetscFunctionReturn(PETSC_SUCCESS);
5139 }
5140 
5141 /*@
5142   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5143 
5144   Not Collective
5145 
5146   Input Parameter:
5147 . A - the matrix
5148 
5149   Output Parameter:
5150 . A_loc - the local sequential matrix generated
5151 
5152   Level: developer
5153 
5154   Notes:
5155   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5156   with `mlocal` rows and `n` columns. Where `mlocal` is obtained with `MatGetLocalSize()` and
5157   `n` is the global column count obtained with `MatGetSize()`
5158 
5159   In other words combines the two parts of a parallel `MATMPIAIJ` matrix on each process to a single matrix.
5160 
5161   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5162 
5163   Destroy the matrix with `MatDestroy()`
5164 
5165 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5166 @*/
5167 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5168 {
5169   PetscBool mpi;
5170 
5171   PetscFunctionBegin;
5172   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5173   if (mpi) {
5174     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5175   } else {
5176     *A_loc = A;
5177     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5178   }
5179   PetscFunctionReturn(PETSC_SUCCESS);
5180 }
5181 
5182 /*@
5183   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5184 
5185   Not Collective
5186 
5187   Input Parameters:
5188 + A     - the matrix
5189 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5190 
5191   Output Parameter:
5192 . A_loc - the local sequential matrix generated
5193 
5194   Level: developer
5195 
5196   Notes:
5197   The matrix is created by taking all `A`'s local rows and putting them into a sequential
  matrix with `mlocal` rows and `n` columns. `mlocal` is the row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5200 
5201   In other words combines the two parts of a parallel `MATMPIAIJ` matrix on each process to a single matrix.
5202 
5203   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5204   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5205   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5206   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5207 
5208 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5209 @*/
PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
{
  Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ        *mat, *a, *b;
  PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray; /* cmap: local off-diagonal column -> global column */
  const PetscScalar *aa, *ba, *aav, *bav;                       /* aa/ba are advancing cursors; aav/bav stay pristine for the Restore calls */
  PetscScalar       *ca, *cam;
  PetscMPIInt        size;
  PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
  PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
  PetscBool          match;

  PetscFunctionBegin;
  /* Accept any type whose name begins with "mpiaij" (covers derived variants), not only plain MATMPIAIJ */
  PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size == 1) {
    /* Single rank: the diagonal block already is the whole matrix */
    if (scall == MAT_INITIAL_MATRIX) {
      PetscCall(PetscObjectReference((PetscObject)mpimat->A));
      *A_loc = mpimat->A;
    } else if (scall == MAT_REUSE_MATRIX) {
      PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
  a  = (Mat_SeqAIJ *)mpimat->A->data;
  b  = (Mat_SeqAIJ *)mpimat->B->data;
  ai = a->i;
  aj = a->j;
  bi = b->i;
  bj = b->j;
  PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
  PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
  aa = aav;
  ba = bav;
  if (scall == MAT_INITIAL_MATRIX) {
    /* Row pointers of the merged matrix: each row holds diag + off-diag entries of A */
    PetscCall(PetscMalloc1(1 + am, &ci));
    ci[0] = 0;
    for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
    PetscCall(PetscMalloc1(1 + ci[am], &cj));
    PetscCall(PetscMalloc1(1 + ci[am], &ca));
    k = 0;
    /* Merge each row in three segments so global column indices come out in
       ascending order (assumes cmap/garray is sorted ascending — standard for MPIAIJ):
       off-diagonal columns left of the diagonal block, then the diagonal block,
       then the remaining off-diagonal columns */
    for (i = 0; i < am; i++) {
      ncols_o = bi[i + 1] - bi[i];
      ncols_d = ai[i + 1] - ai[i];
      /* off-diagonal portion of A */
      for (jo = 0; jo < ncols_o; jo++) {
        col = cmap[*bj];
        if (col >= cstart) break;
        cj[k] = col;
        bj++;
        ca[k++] = *ba++;
      }
      /* diagonal portion of A */
      for (j = 0; j < ncols_d; j++) {
        cj[k]   = cstart + *aj++;
        ca[k++] = *aa++;
      }
      /* off-diagonal portion of A */
      for (j = jo; j < ncols_o; j++) {
        cj[k]   = cmap[*bj++];
        ca[k++] = *ba++;
      }
    }
    /* put together the new matrix */
    PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
    /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
    /* Since these are PETSc arrays, change flags to free them as necessary. */
    mat          = (Mat_SeqAIJ *)(*A_loc)->data;
    mat->free_a  = PETSC_TRUE;
    mat->free_ij = PETSC_TRUE;
    mat->nonew   = 0;
  } else if (scall == MAT_REUSE_MATRIX) {
    /* Structure already exists: only refill the values, walking in the same
       three-segment order used to build it in the MAT_INITIAL_MATRIX branch */
    mat = (Mat_SeqAIJ *)(*A_loc)->data;
    ci  = mat->i;
    cj  = mat->j;
    PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
    for (i = 0; i < am; i++) {
      /* off-diagonal portion of A */
      ncols_o = bi[i + 1] - bi[i];
      for (jo = 0; jo < ncols_o; jo++) {
        col = cmap[*bj];
        if (col >= cstart) break;
        *cam++ = *ba++;
        bj++;
      }
      /* diagonal portion of A */
      ncols_d = ai[i + 1] - ai[i];
      for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
      /* off-diagonal portion of A */
      for (j = jo; j < ncols_o; j++) {
        *cam++ = *ba++;
        bj++;
      }
    }
    PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
  /* Restore with the unadvanced pointers obtained from the Get calls */
  PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
  PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
  PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5314 
5315 /*@
5316   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5317   mlocal rows and n columns. Where n is the sum of the number of columns of the diagonal and off-diagonal part
5318 
5319   Not Collective
5320 
5321   Input Parameters:
5322 + A     - the matrix
5323 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5324 
5325   Output Parameters:
5326 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5327 - A_loc - the local sequential matrix generated
5328 
5329   Level: developer
5330 
5331   Note:
5332   This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returning matrix are those associated with the diagonal
5333   part, then those associated with the off-diagonal part (in its local ordering)
5334 
5335 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5336 @*/
5337 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5338 {
5339   Mat             Ao, Ad;
5340   const PetscInt *cmap;
5341   PetscMPIInt     size;
5342   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5343 
5344   PetscFunctionBegin;
5345   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5346   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5347   if (size == 1) {
5348     if (scall == MAT_INITIAL_MATRIX) {
5349       PetscCall(PetscObjectReference((PetscObject)Ad));
5350       *A_loc = Ad;
5351     } else if (scall == MAT_REUSE_MATRIX) {
5352       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5353     }
5354     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5355     PetscFunctionReturn(PETSC_SUCCESS);
5356   }
5357   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5358   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5359   if (f) {
5360     PetscCall((*f)(A, scall, glob, A_loc));
5361   } else {
5362     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5363     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5364     Mat_SeqAIJ        *c;
5365     PetscInt          *ai = a->i, *aj = a->j;
5366     PetscInt          *bi = b->i, *bj = b->j;
5367     PetscInt          *ci, *cj;
5368     const PetscScalar *aa, *ba;
5369     PetscScalar       *ca;
5370     PetscInt           i, j, am, dn, on;
5371 
5372     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5373     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5374     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5375     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5376     if (scall == MAT_INITIAL_MATRIX) {
5377       PetscInt k;
5378       PetscCall(PetscMalloc1(1 + am, &ci));
5379       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5380       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5381       ci[0] = 0;
5382       for (i = 0, k = 0; i < am; i++) {
5383         const PetscInt ncols_o = bi[i + 1] - bi[i];
5384         const PetscInt ncols_d = ai[i + 1] - ai[i];
5385         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5386         /* diagonal portion of A */
5387         for (j = 0; j < ncols_d; j++, k++) {
5388           cj[k] = *aj++;
5389           ca[k] = *aa++;
5390         }
5391         /* off-diagonal portion of A */
5392         for (j = 0; j < ncols_o; j++, k++) {
5393           cj[k] = dn + *bj++;
5394           ca[k] = *ba++;
5395         }
5396       }
5397       /* put together the new matrix */
5398       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5399       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5400       /* Since these are PETSc arrays, change flags to free them as necessary. */
5401       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5402       c->free_a  = PETSC_TRUE;
5403       c->free_ij = PETSC_TRUE;
5404       c->nonew   = 0;
5405       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5406     } else if (scall == MAT_REUSE_MATRIX) {
5407       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5408       for (i = 0; i < am; i++) {
5409         const PetscInt ncols_d = ai[i + 1] - ai[i];
5410         const PetscInt ncols_o = bi[i + 1] - bi[i];
5411         /* diagonal portion of A */
5412         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5413         /* off-diagonal portion of A */
5414         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5415       }
5416       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5417     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5418     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5419     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &aa));
5420     if (glob) {
5421       PetscInt cst, *gidx;
5422 
5423       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5424       PetscCall(PetscMalloc1(dn + on, &gidx));
5425       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5426       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5427       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5428     }
5429   }
5430   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5431   PetscFunctionReturn(PETSC_SUCCESS);
5432 }
5433 
5434 /*@C
5435   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from an `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5436 
5437   Not Collective
5438 
5439   Input Parameters:
5440 + A     - the matrix
5441 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5442 . row   - index set of rows to extract (or `NULL`)
5443 - col   - index set of columns to extract (or `NULL`)
5444 
5445   Output Parameter:
5446 . A_loc - the local sequential matrix generated
5447 
5448   Level: developer
5449 
5450 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5451 @*/
PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
  IS          isrowa, iscola;
  Mat        *aloc;
  PetscBool   match;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
  PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
  if (!row) {
    /* Default row set: all locally owned rows */
    start = A->rmap->rstart;
    end   = A->rmap->rend;
    PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
  } else {
    isrowa = *row;
  }
  if (!col) {
    /* Default column set: all columns with a nonzero in the local rows, in ascending
       global order — off-diagonal columns left of the owned block (cmap/garray is kept
       sorted by MPIAIJ), then the owned columns, then the remaining off-diagonal ones */
    start = A->cmap->rstart;
    cmap  = a->garray;
    nzA   = a->A->cmap->n;
    nzB   = a->B->cmap->n;
    PetscCall(PetscMalloc1(nzA + nzB, &idx));
    ncols = 0;
    for (i = 0; i < nzB; i++) {
      if (cmap[i] < start) idx[ncols++] = cmap[i];
      else break;
    }
    imark = i; /* first off-diagonal column at or beyond the owned block */
    for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
    for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
    PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
  } else {
    iscola = *col;
  }
  if (scall != MAT_INITIAL_MATRIX) {
    /* MatCreateSubMatrices() expects an array of matrices to reuse */
    PetscCall(PetscMalloc1(1, &aloc));
    aloc[0] = *A_loc;
  }
  PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
  if (!col) { /* attach global id of condensed columns */
    PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
  }
  *A_loc = aloc[0];
  PetscCall(PetscFree(aloc));
  if (!row) PetscCall(ISDestroy(&isrowa));
  if (!col) PetscCall(ISDestroy(&iscola));
  PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5504 
5505 /*
 * Create a sequential AIJ matrix based on row indices; a whole row is extracted once it is matched.
 * Rows may be local or remote. The routine is designed to be scalable in memory so that nothing is based
 * on a global size.
5509  * */
static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
{
  Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
  PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
  PetscMPIInt            owner;
  PetscSFNode           *iremote, *oiremote;
  const PetscInt        *lrowindices;
  PetscSF                sf, osf; /* sf: diagonal-part exchange; osf: off-diagonal-part exchange */
  PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
  PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
  MPI_Comm               comm;
  ISLocalToGlobalMapping mapping;
  const PetscScalar     *pd_a, *po_a;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
  /* plocalsize is the number of roots
   * nrows is the number of leaves
   * */
  PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
  PetscCall(ISGetLocalSize(rows, &nrows));
  PetscCall(PetscCalloc1(nrows, &iremote));
  PetscCall(ISGetIndices(rows, &lrowindices));
  for (i = 0; i < nrows; i++) {
    /* Find a remote index and an owner for a row
     * The row could be local or remote
     * */
    owner = 0;
    lidx  = 0;
    PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
    iremote[i].index = lidx;
    iremote[i].rank  = owner;
  }
  /* Create SF to communicate how many nonzero columns for each row */
  PetscCall(PetscSFCreate(comm, &sf));
  /* SF will figure out the number of nonzero columns for each row, and their
   * offsets
   * */
  PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));

  /* Per-row (count, offset) pairs are stored interleaved: slot 2*i is the diagonal
     part, slot 2*i+1 the off-diagonal part, so one MPIU_2INT broadcast moves both */
  PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
  PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
  PetscCall(PetscCalloc1(nrows, &pnnz));
  roffsets[0] = 0;
  roffsets[1] = 0;
  for (i = 0; i < plocalsize; i++) {
    /* diagonal */
    nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
    /* off-diagonal */
    nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
    /* compute offsets so that we relative location for each row */
    roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
    roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
  }
  PetscCall(PetscCalloc1(2 * nrows, &nlcols));
  PetscCall(PetscCalloc1(2 * nrows, &loffsets));
  /* 'r' means root, and 'l' means leaf */
  PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(roffsets));
  PetscCall(PetscFree(nrcols));
  dntotalcols = 0;
  ontotalcols = 0;
  ncol        = 0;
  for (i = 0; i < nrows; i++) {
    pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
    ncol    = PetscMax(pnnz[i], ncol);
    /* diagonal */
    dntotalcols += nlcols[i * 2 + 0];
    /* off-diagonal */
    ontotalcols += nlcols[i * 2 + 1];
  }
  /* We do not need to figure the right number of columns
   * since all the calculations will be done by going through the raw data
   * */
  PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
  PetscCall(MatSetUp(*P_oth));
  PetscCall(PetscFree(pnnz));
  p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
  /* Build one entry-level SF per part: every nonzero of P_oth is a leaf pointing at
     the corresponding nonzero of P's diagonal (sf) or off-diagonal (osf) storage */
  /* diagonal */
  PetscCall(PetscCalloc1(dntotalcols, &iremote));
  /* off-diagonal */
  PetscCall(PetscCalloc1(ontotalcols, &oiremote));
  /* diagonal */
  PetscCall(PetscCalloc1(dntotalcols, &ilocal));
  /* off-diagonal */
  PetscCall(PetscCalloc1(ontotalcols, &oilocal));
  dntotalcols = 0;
  ontotalcols = 0;
  ntotalcols  = 0;
  for (i = 0; i < nrows; i++) {
    owner = 0;
    PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
    /* Set iremote for diag matrix */
    for (j = 0; j < nlcols[i * 2 + 0]; j++) {
      iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
      iremote[dntotalcols].rank  = owner;
      /* P_oth is seqAIJ so that ilocal need to point to the first part of memory */
      ilocal[dntotalcols++] = ntotalcols++;
    }
    /* off-diagonal */
    for (j = 0; j < nlcols[i * 2 + 1]; j++) {
      oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
      oiremote[ontotalcols].rank  = owner;
      oilocal[ontotalcols++]      = ntotalcols++;
    }
  }
  PetscCall(ISRestoreIndices(rows, &lrowindices));
  PetscCall(PetscFree(loffsets));
  PetscCall(PetscFree(nlcols));
  PetscCall(PetscSFCreate(comm, &sf));
  /* P serves as roots and P_oth is leaves
   * Diag matrix
   * */
  PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));

  PetscCall(PetscSFCreate(comm, &osf));
  /* off-diagonal */
  PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(osf));
  PetscCall(PetscSFSetUp(osf));
  PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
  PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
  /* operate on the matrix internal data to save memory */
  PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
  /* Convert to global indices for diag matrix.
     NOTE: pd->j is shifted IN PLACE here and shifted back after the broadcast starts,
     so P's own column indices are temporarily global between these two loops */
  for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* We want P_oth store global indices */
  PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
  /* Use memory scalable approach */
  PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
  /* po->j is also translated to global indices in place, and mapped back below */
  PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
  PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* Convert back to local indices */
  for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
  PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
  PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal to nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
  PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
  /* Exchange values */
  PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
  PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
  PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
  /* Stop PETSc from shrinking memory */
  for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
  PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
  /* Attach PetscSF objects to P_oth so that we can reuse it later */
  PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
  PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscSFDestroy(&osf));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5678 
/*
 * Creates a SeqAIJ matrix by taking the rows of P that correspond to nonzero off-diagonal columns of local A
 * This supports MPIAIJ and MAIJ
 * */
5683 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5684 {
5685   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5686   Mat_SeqAIJ *p_oth;
5687   IS          rows, map;
5688   PetscHMapI  hamp;
5689   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5690   MPI_Comm    comm;
5691   PetscSF     sf, osf;
5692   PetscBool   has;
5693 
5694   PetscFunctionBegin;
5695   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5696   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5697   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5698    *  and then create a submatrix (that often is an overlapping matrix)
5699    * */
5700   if (reuse == MAT_INITIAL_MATRIX) {
5701     /* Use a hash table to figure out unique keys */
5702     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5703     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5704     count = 0;
5705     /* Assume that  a->g is sorted, otherwise the following does not make sense */
5706     for (i = 0; i < a->B->cmap->n; i++) {
5707       key = a->garray[i] / dof;
5708       PetscCall(PetscHMapIHas(hamp, key, &has));
5709       if (!has) {
5710         mapping[i] = count;
5711         PetscCall(PetscHMapISet(hamp, key, count++));
5712       } else {
5713         /* Current 'i' has the same value the previous step */
5714         mapping[i] = count - 1;
5715       }
5716     }
5717     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5718     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5719     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5720     PetscCall(PetscCalloc1(htsize, &rowindices));
5721     off = 0;
5722     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5723     PetscCall(PetscHMapIDestroy(&hamp));
5724     PetscCall(PetscSortInt(htsize, rowindices));
5725     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
5726     /* In case, the matrix was already created but users want to recreate the matrix */
5727     PetscCall(MatDestroy(P_oth));
5728     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5729     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5730     PetscCall(ISDestroy(&map));
5731     PetscCall(ISDestroy(&rows));
5732   } else if (reuse == MAT_REUSE_MATRIX) {
5733     /* If matrix was already created, we simply update values using SF objects
5734      * that as attached to the matrix earlier.
5735      */
5736     const PetscScalar *pd_a, *po_a;
5737 
5738     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5739     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5740     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5741     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5742     /* Update values in place */
5743     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5744     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5745     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5746     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5747     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5748     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5749     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5750     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5751   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5752   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5753   PetscFunctionReturn(PETSC_SUCCESS);
5754 }
5755 
5756 /*@C
  MatGetBrowsOfAcols - Returns an `IS` that contains the rows of `B` corresponding to the nonzero columns of local `A`
5758 
5759   Collective
5760 
5761   Input Parameters:
5762 + A     - the first matrix in `MATMPIAIJ` format
5763 . B     - the second matrix in `MATMPIAIJ` format
5764 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5765 
5766   Output Parameters:
5767 + rowb  - On input index sets of rows of B to extract (or `NULL`), modified on output
5768 . colb  - On input index sets of columns of B to extract (or `NULL`), modified on output
5769 - B_seq - the sequential matrix generated
5770 
5771   Level: developer
5772 
5773 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5774 @*/
5775 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5776 {
5777   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5778   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5779   IS          isrowb, iscolb;
5780   Mat        *bseq = NULL;
5781 
5782   PetscFunctionBegin;
5783   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5784              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5785   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5786 
5787   if (scall == MAT_INITIAL_MATRIX) {
5788     start = A->cmap->rstart;
5789     cmap  = a->garray;
5790     nzA   = a->A->cmap->n;
5791     nzB   = a->B->cmap->n;
5792     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5793     ncols = 0;
5794     for (i = 0; i < nzB; i++) { /* row < local row index */
5795       if (cmap[i] < start) idx[ncols++] = cmap[i];
5796       else break;
5797     }
5798     imark = i;
5799     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5800     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5801     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5802     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5803   } else {
5804     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5805     isrowb = *rowb;
5806     iscolb = *colb;
5807     PetscCall(PetscMalloc1(1, &bseq));
5808     bseq[0] = *B_seq;
5809   }
5810   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5811   *B_seq = bseq[0];
5812   PetscCall(PetscFree(bseq));
5813   if (!rowb) {
5814     PetscCall(ISDestroy(&isrowb));
5815   } else {
5816     *rowb = isrowb;
5817   }
5818   if (!colb) {
5819     PetscCall(ISDestroy(&iscolb));
5820   } else {
5821     *colb = iscolb;
5822   }
5823   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5824   PetscFunctionReturn(PETSC_SUCCESS);
5825 }
5826 
5827 /*
5828     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking rows of B that equal to nonzero columns
5829     of the OFF-DIAGONAL portion of local A
5830 
5831     Collective
5832 
5833    Input Parameters:
5834 +    A,B - the matrices in `MATMPIAIJ` format
5835 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5836 
   Output Parameters:
5838 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5839 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5840 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5841 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5842 
5843     Developer Note:
5844     This directly accesses information inside the VecScatter associated with the matrix-vector product
     for this matrix. This is not desirable.
5846 
5847     Level: developer
5848 
5849 */
5850 
5851 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5852 {
5853   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5854   VecScatter         ctx;
5855   MPI_Comm           comm;
5856   const PetscMPIInt *rprocs, *sprocs;
5857   PetscMPIInt        nrecvs, nsends;
5858   const PetscInt    *srow, *rstarts, *sstarts;
5859   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5860   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5861   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5862   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5863   PetscMPIInt        size, tag, rank, nreqs;
5864 
5865   PetscFunctionBegin;
5866   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5867   PetscCallMPI(MPI_Comm_size(comm, &size));
5868 
5869   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5870              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5871   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5872   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5873 
5874   if (size == 1) {
5875     startsj_s = NULL;
5876     bufa_ptr  = NULL;
5877     *B_oth    = NULL;
5878     PetscFunctionReturn(PETSC_SUCCESS);
5879   }
5880 
5881   ctx = a->Mvctx;
5882   tag = ((PetscObject)ctx)->tag;
5883 
5884   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5885   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5886   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5887   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5888   PetscCall(PetscMalloc1(nreqs, &reqs));
5889   rwaits = reqs;
5890   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5891 
5892   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5893   if (scall == MAT_INITIAL_MATRIX) {
5894     /* i-array */
5895     /*  post receives */
5896     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5897     for (i = 0; i < nrecvs; i++) {
5898       rowlen = rvalues + rstarts[i] * rbs;
5899       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5900       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5901     }
5902 
5903     /* pack the outgoing message */
5904     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5905 
5906     sstartsj[0] = 0;
5907     rstartsj[0] = 0;
5908     len         = 0; /* total length of j or a array to be sent */
5909     if (nsends) {
5910       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5911       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5912     }
5913     for (i = 0; i < nsends; i++) {
5914       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5915       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5916       for (j = 0; j < nrows; j++) {
5917         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5918         for (l = 0; l < sbs; l++) {
5919           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5920 
5921           rowlen[j * sbs + l] = ncols;
5922 
5923           len += ncols;
5924           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5925         }
5926         k++;
5927       }
5928       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5929 
5930       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5931     }
5932     /* recvs and sends of i-array are completed */
5933     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5934     PetscCall(PetscFree(svalues));
5935 
5936     /* allocate buffers for sending j and a arrays */
5937     PetscCall(PetscMalloc1(len + 1, &bufj));
5938     PetscCall(PetscMalloc1(len + 1, &bufa));
5939 
5940     /* create i-array of B_oth */
5941     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5942 
5943     b_othi[0] = 0;
5944     len       = 0; /* total length of j or a array to be received */
5945     k         = 0;
5946     for (i = 0; i < nrecvs; i++) {
5947       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5948       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5949       for (j = 0; j < nrows; j++) {
5950         b_othi[k + 1] = b_othi[k] + rowlen[j];
5951         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5952         k++;
5953       }
5954       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5955     }
5956     PetscCall(PetscFree(rvalues));
5957 
5958     /* allocate space for j and a arrays of B_oth */
5959     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5960     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5961 
5962     /* j-array */
5963     /*  post receives of j-array */
5964     for (i = 0; i < nrecvs; i++) {
5965       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5966       PetscCallMPI(MPIU_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5967     }
5968 
5969     /* pack the outgoing message j-array */
5970     if (nsends) k = sstarts[0];
5971     for (i = 0; i < nsends; i++) {
5972       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5973       bufJ  = bufj + sstartsj[i];
5974       for (j = 0; j < nrows; j++) {
5975         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5976         for (ll = 0; ll < sbs; ll++) {
5977           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5978           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5979           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5980         }
5981       }
5982       PetscCallMPI(MPIU_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5983     }
5984 
5985     /* recvs and sends of j-array are completed */
5986     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5987   } else if (scall == MAT_REUSE_MATRIX) {
5988     sstartsj = *startsj_s;
5989     rstartsj = *startsj_r;
5990     bufa     = *bufa_ptr;
5991     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5992   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
5993 
5994   /* a-array */
5995   /*  post receives of a-array */
5996   for (i = 0; i < nrecvs; i++) {
5997     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5998     PetscCallMPI(MPIU_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5999   }
6000 
6001   /* pack the outgoing message a-array */
6002   if (nsends) k = sstarts[0];
6003   for (i = 0; i < nsends; i++) {
6004     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
6005     bufA  = bufa + sstartsj[i];
6006     for (j = 0; j < nrows; j++) {
6007       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
6008       for (ll = 0; ll < sbs; ll++) {
6009         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
6010         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
6011         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
6012       }
6013     }
6014     PetscCallMPI(MPIU_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
6015   }
6016   /* recvs and sends of a-array are completed */
6017   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
6018   PetscCall(PetscFree(reqs));
6019 
6020   if (scall == MAT_INITIAL_MATRIX) {
6021     Mat_SeqAIJ *b_oth;
6022 
6023     /* put together the new matrix */
6024     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
6025 
6026     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6027     /* Since these are PETSc arrays, change flags to free them as necessary. */
6028     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
6029     b_oth->free_a  = PETSC_TRUE;
6030     b_oth->free_ij = PETSC_TRUE;
6031     b_oth->nonew   = 0;
6032 
6033     PetscCall(PetscFree(bufj));
6034     if (!startsj_s || !bufa_ptr) {
6035       PetscCall(PetscFree2(sstartsj, rstartsj));
6036       PetscCall(PetscFree(bufa_ptr));
6037     } else {
6038       *startsj_s = sstartsj;
6039       *startsj_r = rstartsj;
6040       *bufa_ptr  = bufa;
6041     }
6042   } else if (scall == MAT_REUSE_MATRIX) {
6043     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
6044   }
6045 
6046   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
6047   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
6048   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
6049   PetscFunctionReturn(PETSC_SUCCESS);
6050 }
6051 
6052 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
6053 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
6054 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
6055 #if defined(PETSC_HAVE_MKL_SPARSE)
6056 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
6057 #endif
6058 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
6059 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
6060 #if defined(PETSC_HAVE_ELEMENTAL)
6061 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
6062 #endif
6063 #if defined(PETSC_HAVE_SCALAPACK)
6064 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
6065 #endif
6066 #if defined(PETSC_HAVE_HYPRE)
6067 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
6068 #endif
6069 #if defined(PETSC_HAVE_CUDA)
6070 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
6071 #endif
6072 #if defined(PETSC_HAVE_HIP)
6073 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6074 #endif
6075 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6076 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6077 #endif
6078 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6079 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6080 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6081 
6082 /*
6083     Computes (B'*A')' since computing B*A directly is untenable
6084 
6085                n                       p                          p
6086         [             ]       [             ]         [                 ]
6087       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6088         [             ]       [             ]         [                 ]
6089 
6090 */
6091 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6092 {
6093   Mat At, Bt, Ct;
6094 
6095   PetscFunctionBegin;
6096   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6097   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6098   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6099   PetscCall(MatDestroy(&At));
6100   PetscCall(MatDestroy(&Bt));
6101   PetscCall(MatTransposeSetPrecursor(Ct, C));
6102   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6103   PetscCall(MatDestroy(&Ct));
6104   PetscFunctionReturn(PETSC_SUCCESS);
6105 }
6106 
6107 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6108 {
6109   PetscBool cisdense;
6110 
6111   PetscFunctionBegin;
6112   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6113   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6114   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6115   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6116   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6117   PetscCall(MatSetUp(C));
6118 
6119   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6120   PetscFunctionReturn(PETSC_SUCCESS);
6121 }
6122 
6123 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6124 {
6125   Mat_Product *product = C->product;
6126   Mat          A = product->A, B = product->B;
6127 
6128   PetscFunctionBegin;
6129   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6130              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6131   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6132   C->ops->productsymbolic = MatProductSymbolic_AB;
6133   PetscFunctionReturn(PETSC_SUCCESS);
6134 }
6135 
6136 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6137 {
6138   Mat_Product *product = C->product;
6139 
6140   PetscFunctionBegin;
6141   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6142   PetscFunctionReturn(PETSC_SUCCESS);
6143 }
6144 
6145 /*
6146    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6147 
6148   Input Parameters:
6149 
6150     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6151     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6152 
6153     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6154 
6155     For Set1, j1[] contains column indices of the nonzeros.
6156     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
    respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6158     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6159 
6160     Similar for Set2.
6161 
6162     This routine merges the two sets of nonzeros row by row and removes repeats.
6163 
6164   Output Parameters: (memory is allocated by the caller)
6165 
6166     i[],j[]: the CSR of the merged matrix, which has m rows.
6167     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6168     imap2[]: similar to imap1[], but for Set2.
6169     Note we order nonzeros row-by-row and from left to right.
6170 */
static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
{
  PetscInt   r, m; /* Row index of mat, and its number of local rows */
  PetscCount t, t1, t2, b1, e1, b2, e2;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(mat, &m, NULL));
  t1 = t2 = t = 0; /* Count unique nonzeros of Set1, Set2 and the merged matrix respectively */
  i[0]        = 0;
  for (r = 0; r < m; r++) { /* Do row by row merging */
    b1 = rowBegin1[r];
    e1 = rowEnd1[r];
    b2 = rowBegin2[r];
    e2 = rowEnd2[r];
    /* Classic two-pointer merge over the two sorted (possibly repeating) column lists of row r;
       b1/b2 always point at the FIRST occurrence of the current unique entry in each set */
    while (b1 < e1 && b2 < e2) {
      if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
        j[t]      = j1[b1];
        imap1[t1] = t;
        imap2[t2] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to next unique local nonzero */
        b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to next unique remote nonzero */
        t1++;
        t2++;
        t++;
      } else if (j1[b1] < j2[b2]) { /* Entry only in Set1 */
        j[t]      = j1[b1];
        imap1[t1] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1];
        t1++;
        t++;
      } else { /* Entry only in Set2 */
        j[t]      = j2[b2];
        imap2[t2] = t;
        b2 += jmap2[t2 + 1] - jmap2[t2];
        t2++;
        t++;
      }
    }
    /* Merge the remaining in either j1[] or j2[] */
    while (b1 < e1) {
      j[t]      = j1[b1];
      imap1[t1] = t;
      b1 += jmap1[t1 + 1] - jmap1[t1];
      t1++;
      t++;
    }
    while (b2 < e2) {
      j[t]      = j2[b2];
      imap2[t2] = t;
      b2 += jmap2[t2 + 1] - jmap2[t2];
      t2++;
      t++;
    }
    /* Close row r of the merged CSR; t is the running count of unique merged nonzeros */
    PetscCall(PetscIntCast(t, i + r + 1));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6228 
6229 /*
6230   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6231 
6232   Input Parameters:
6233     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6234     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6235       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6236 
6237       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6238       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6239 
6240   Output Parameters:
6241     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6242     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6243       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6244       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6245 
6246     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6247       Atot: number of entries belonging to the diagonal block.
6248       Annz: number of unique nonzeros belonging to the diagonal block.
6249       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6250         repeats (i.e., same 'i,j' pair).
6251       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6252         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6257     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6258 
6259     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6260 */
static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
{
  PetscInt    cstart, cend, rstart, rend, row, col;
  PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
  PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
  PetscCount  k, m, p, q, r, s, mid;
  PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;

  PetscFunctionBegin;
  PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
  PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
  m = rend - rstart; /* number of local rows */

  /* Skip negative rows */
  for (k = 0; k < n; k++)
    if (i[k] >= 0) break;

  /* Process [k,n): sort and partition each local row into diag and offdiag portions,
     fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
  */
  while (k < n) {
    row = i[k];
    /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
    for (s = k; s < n; s++)
      if (i[s] != row) break;

    /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
    for (p = k; p < s; p++) {
      if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
      else PetscAssert((j[p] >= 0) && (j[p] <= mat->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is out of range", j[p]);
    }
    /* Sort j[k,s) (diag entries now sort before offdiag because of the shift) and apply the same swaps to perm[] */
    PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
    PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
    rowBegin[row - rstart] = k;
    rowMid[row - rstart]   = mid;
    rowEnd[row - rstart]   = s;

    /* Count nonzeros of this diag/offdiag row, which might have repeats */
    Atot += mid - k;
    Btot += s - mid;

    /* Count unique nonzeros of this diag row; each inner do-while consumes one run of equal
       (still-shifted) column indices while reverting them to their true values */
    for (p = k; p < mid;) {
      col = j[p];
      do {
        j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
        p++;
      } while (p < mid && j[p] == col);
      Annz++;
    }

    /* Count unique nonzeros of this offdiag row */
    for (p = mid; p < s;) {
      col = j[p];
      do {
        p++;
      } while (p < s && j[p] == col);
      Bnnz++;
    }
    k = s; /* advance to the next row's first entry */
  }

  /* Allocation according to Atot, Btot, Annz, Bnnz */
  PetscCall(PetscMalloc1(Atot, &Aperm));
  PetscCall(PetscMalloc1(Btot, &Bperm));
  PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
  PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));

  /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
  Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0; /* reuse the counters as running offsets for the second pass */
  for (r = 0; r < m; r++) {
    k   = rowBegin[r];
    mid = rowMid[r];
    s   = rowEnd[r];
    /* Copy this row's diag/offdiag slices of perm[] into the per-block permutation arrays */
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
    Atot += mid - k;
    Btot += s - mid;

    /* Scan column indices in this row and find out how many repeats each unique nonzero has */
    for (p = k; p < mid;) {
      col = j[p];
      q   = p;
      do {
        p++;
      } while (p < mid && j[p] == col);
      Ajmap[Annz + 1] = Ajmap[Annz] + (p - q); /* p - q = repeat count of this unique diag nonzero */
      Annz++;
    }

    for (p = mid; p < s;) {
      col = j[p];
      q   = p;
      do {
        p++;
      } while (p < s && j[p] == col);
      Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q); /* repeat count of this unique offdiag nonzero */
      Bnnz++;
    }
  }
  /* Output */
  *Aperm_ = Aperm;
  *Annz_  = Annz;
  *Atot_  = Atot;
  *Ajmap_ = Ajmap;
  *Bperm_ = Bperm;
  *Bnnz_  = Bnnz;
  *Btot_  = Btot;
  *Bjmap_ = Bjmap;
  PetscFunctionReturn(PETSC_SUCCESS);
}
6372 
6373 /*
6374   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6375 
6376   Input Parameters:
6377     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6378     nnz:  number of unique nonzeros in the merged matrix
6379     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6380     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6381 
6382   Output Parameter: (memory is allocated by the caller)
6383     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6384 
6385   Example:
6386     nnz1 = 4
6387     nnz  = 6
6388     imap = [1,3,4,5]
6389     jmap = [0,3,5,6,7]
6390    then,
6391     jmap_new = [0,0,3,3,5,6,7]
6392 */
6393 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6394 {
6395   PetscCount k, p;
6396 
6397   PetscFunctionBegin;
6398   jmap_new[0] = 0;
6399   p           = nnz;                /* p loops over jmap_new[] backwards */
6400   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6401     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6402   }
6403   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6404   PetscFunctionReturn(PETSC_SUCCESS);
6405 }
6406 
6407 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6408 {
6409   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6410 
6411   PetscFunctionBegin;
6412   PetscCall(PetscSFDestroy(&coo->sf));
6413   PetscCall(PetscFree(coo->Aperm1));
6414   PetscCall(PetscFree(coo->Bperm1));
6415   PetscCall(PetscFree(coo->Ajmap1));
6416   PetscCall(PetscFree(coo->Bjmap1));
6417   PetscCall(PetscFree(coo->Aimap2));
6418   PetscCall(PetscFree(coo->Bimap2));
6419   PetscCall(PetscFree(coo->Aperm2));
6420   PetscCall(PetscFree(coo->Bperm2));
6421   PetscCall(PetscFree(coo->Ajmap2));
6422   PetscCall(PetscFree(coo->Bjmap2));
6423   PetscCall(PetscFree(coo->Cperm1));
6424   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6425   PetscCall(PetscFree(coo));
6426   PetscFunctionReturn(PETSC_SUCCESS);
6427 }
6428 
6429 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6430 {
6431   MPI_Comm             comm;
6432   PetscMPIInt          rank, size;
6433   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6434   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6435   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6436   PetscContainer       container;
6437   MatCOOStruct_MPIAIJ *coo;
6438 
6439   PetscFunctionBegin;
6440   PetscCall(PetscFree(mpiaij->garray));
6441   PetscCall(VecDestroy(&mpiaij->lvec));
6442 #if defined(PETSC_USE_CTABLE)
6443   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6444 #else
6445   PetscCall(PetscFree(mpiaij->colmap));
6446 #endif
6447   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6448   mat->assembled     = PETSC_FALSE;
6449   mat->was_assembled = PETSC_FALSE;
6450 
6451   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6452   PetscCallMPI(MPI_Comm_size(comm, &size));
6453   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6454   PetscCall(PetscLayoutSetUp(mat->rmap));
6455   PetscCall(PetscLayoutSetUp(mat->cmap));
6456   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6457   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6458   PetscCall(MatGetLocalSize(mat, &m, &n));
6459   PetscCall(MatGetSize(mat, &M, &N));
6460 
6461   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6462   /* entries come first, then local rows, then remote rows.                     */
6463   PetscCount n1 = coo_n, *perm1;
6464   PetscInt  *i1 = coo_i, *j1 = coo_j;
6465 
6466   PetscCall(PetscMalloc1(n1, &perm1));
6467   for (k = 0; k < n1; k++) perm1[k] = k;
6468 
6469   /* Manipulate indices so that entries with negative row or col indices will have smallest
6470      row indices, local entries will have greater but negative row indices, and remote entries
6471      will have positive row indices.
6472   */
6473   for (k = 0; k < n1; k++) {
6474     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6475     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6476     else {
6477       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but insert to remote rows");
6478       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6479     }
6480   }
6481 
6482   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6483   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6484 
6485   /* Advance k to the first entry we need to take care of */
6486   for (k = 0; k < n1; k++)
6487     if (i1[k] > PETSC_INT_MIN) break;
6488   PetscCount i1start = k;
6489 
6490   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6491   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows*/
6492 
6493   /*           Send remote rows to their owner                                  */
6494   /* Find which rows should be sent to which remote ranks*/
6495   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6496   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6497   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6498   const PetscInt *ranges;
6499   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6500 
6501   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6502   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6503   for (k = rem; k < n1;) {
6504     PetscMPIInt owner;
6505     PetscInt    firstRow, lastRow;
6506 
6507     /* Locate a row range */
6508     firstRow = i1[k]; /* first row of this owner */
6509     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6510     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6511 
6512     /* Find the first index 'p' in [k,n) with i[p] belonging to next owner */
6513     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6514 
6515     /* All entries in [k,p) belong to this remote owner */
6516     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6517       PetscMPIInt *sendto2;
6518       PetscInt    *nentries2;
6519       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6520 
6521       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6522       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6523       PetscCall(PetscArraycpy(nentries2, nentries2, maxNsend + 1));
6524       PetscCall(PetscFree2(sendto, nentries2));
6525       sendto   = sendto2;
6526       nentries = nentries2;
6527       maxNsend = maxNsend2;
6528     }
6529     sendto[nsend] = owner;
6530     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6531     nsend++;
6532     k = p;
6533   }
6534 
6535   /* Build 1st SF to know offsets on remote to send data */
6536   PetscSF      sf1;
6537   PetscInt     nroots = 1, nroots2 = 0;
6538   PetscInt     nleaves = nsend, nleaves2 = 0;
6539   PetscInt    *offsets;
6540   PetscSFNode *iremote;
6541 
6542   PetscCall(PetscSFCreate(comm, &sf1));
6543   PetscCall(PetscMalloc1(nsend, &iremote));
6544   PetscCall(PetscMalloc1(nsend, &offsets));
6545   for (k = 0; k < nsend; k++) {
6546     iremote[k].rank  = sendto[k];
6547     iremote[k].index = 0;
6548     nleaves2 += nentries[k];
6549     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6550   }
6551   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6552   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6553   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Would nroots2 overflow, we check offsets[] below */
6554   PetscCall(PetscSFDestroy(&sf1));
6555   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6556 
6557   /* Build 2nd SF to send remote COOs to their owner */
6558   PetscSF sf2;
6559   nroots  = nroots2;
6560   nleaves = nleaves2;
6561   PetscCall(PetscSFCreate(comm, &sf2));
6562   PetscCall(PetscSFSetFromOptions(sf2));
6563   PetscCall(PetscMalloc1(nleaves, &iremote));
6564   p = 0;
6565   for (k = 0; k < nsend; k++) {
6566     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6567     for (q = 0; q < nentries[k]; q++, p++) {
6568       iremote[p].rank = sendto[k];
6569       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6570     }
6571   }
6572   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6573 
6574   /* Send the remote COOs to their owner */
6575   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6576   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6577   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6578   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6579   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6580   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6581   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6582   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6583   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6584   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6585   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6586 
6587   PetscCall(PetscFree(offsets));
6588   PetscCall(PetscFree2(sendto, nentries));
6589 
6590   /* Sort received COOs by row along with the permutation array     */
6591   for (k = 0; k < n2; k++) perm2[k] = k;
6592   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6593 
6594   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6595   PetscCount *Cperm1;
6596   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6597   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6598   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6599   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6600 
6601   /* Support for HYPRE matrices, kind of a hack.
6602      Swap min column with diagonal so that diagonal values will go first */
6603   PetscBool hypre;
6604   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6605   if (hypre) {
6606     PetscInt *minj;
6607     PetscBT   hasdiag;
6608 
6609     PetscCall(PetscBTCreate(m, &hasdiag));
6610     PetscCall(PetscMalloc1(m, &minj));
6611     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6612     for (k = i1start; k < rem; k++) {
6613       if (j1[k] < cstart || j1[k] >= cend) continue;
6614       const PetscInt rindex = i1[k] - rstart;
6615       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6616       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6617     }
6618     for (k = 0; k < n2; k++) {
6619       if (j2[k] < cstart || j2[k] >= cend) continue;
6620       const PetscInt rindex = i2[k] - rstart;
6621       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6622       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6623     }
6624     for (k = i1start; k < rem; k++) {
6625       const PetscInt rindex = i1[k] - rstart;
6626       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6627       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6628       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6629     }
6630     for (k = 0; k < n2; k++) {
6631       const PetscInt rindex = i2[k] - rstart;
6632       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6633       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6634       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6635     }
6636     PetscCall(PetscBTDestroy(&hasdiag));
6637     PetscCall(PetscFree(minj));
6638   }
6639 
6640   /* Split local COOs and received COOs into diag/offdiag portions */
6641   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6642   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6643   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6644   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6645   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6646   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6647 
6648   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6649   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6650   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6651   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6652 
6653   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6654   PetscInt *Ai, *Bi;
6655   PetscInt *Aj, *Bj;
6656 
6657   PetscCall(PetscMalloc1(m + 1, &Ai));
6658   PetscCall(PetscMalloc1(m + 1, &Bi));
6659   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6660   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6661 
6662   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6663   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6664   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6665   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6666   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6667 
6668   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6669   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6670 
6671   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6672   /* expect nonzeros in A/B most likely have local contributing entries        */
6673   PetscInt    Annz = Ai[m];
6674   PetscInt    Bnnz = Bi[m];
6675   PetscCount *Ajmap1_new, *Bjmap1_new;
6676 
6677   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6678   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6679 
6680   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6681   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6682 
6683   PetscCall(PetscFree(Aimap1));
6684   PetscCall(PetscFree(Ajmap1));
6685   PetscCall(PetscFree(Bimap1));
6686   PetscCall(PetscFree(Bjmap1));
6687   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6688   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6689   PetscCall(PetscFree(perm1));
6690   PetscCall(PetscFree3(i2, j2, perm2));
6691 
6692   Ajmap1 = Ajmap1_new;
6693   Bjmap1 = Bjmap1_new;
6694 
6695   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6696   if (Annz < Annz1 + Annz2) {
6697     PetscInt *Aj_new;
6698     PetscCall(PetscMalloc1(Annz, &Aj_new));
6699     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6700     PetscCall(PetscFree(Aj));
6701     Aj = Aj_new;
6702   }
6703 
6704   if (Bnnz < Bnnz1 + Bnnz2) {
6705     PetscInt *Bj_new;
6706     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6707     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6708     PetscCall(PetscFree(Bj));
6709     Bj = Bj_new;
6710   }
6711 
6712   /* Create new submatrices for on-process and off-process coupling                  */
6713   PetscScalar     *Aa, *Ba;
6714   MatType          rtype;
6715   Mat_SeqAIJ      *a, *b;
6716   PetscObjectState state;
6717   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6718   PetscCall(PetscCalloc1(Bnnz, &Ba));
6719   /* make Aj[] local, i.e, based off the start column of the diagonal portion */
6720   if (cstart) {
6721     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6722   }
6723 
6724   PetscCall(MatGetRootType_Private(mat, &rtype));
6725 
6726   MatSeqXAIJGetOptions_Private(mpiaij->A);
6727   PetscCall(MatDestroy(&mpiaij->A));
6728   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6729   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6730   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6731 
6732   MatSeqXAIJGetOptions_Private(mpiaij->B);
6733   PetscCall(MatDestroy(&mpiaij->B));
6734   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6735   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6736   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6737 
6738   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6739   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6740   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6741   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6742 
6743   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6744   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6745   a->free_a  = PETSC_TRUE;
6746   a->free_ij = PETSC_TRUE;
6747   b->free_a  = PETSC_TRUE;
6748   b->free_ij = PETSC_TRUE;
6749   a->maxnz   = a->nz;
6750   b->maxnz   = b->nz;
6751 
6752   /* conversion must happen AFTER multiply setup */
6753   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6754   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6755   PetscCall(VecDestroy(&mpiaij->lvec));
6756   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6757 
6758   // Put the COO struct in a container and then attach that to the matrix
6759   PetscCall(PetscMalloc1(1, &coo));
6760   coo->n       = coo_n;
6761   coo->sf      = sf2;
6762   coo->sendlen = nleaves;
6763   coo->recvlen = nroots;
6764   coo->Annz    = Annz;
6765   coo->Bnnz    = Bnnz;
6766   coo->Annz2   = Annz2;
6767   coo->Bnnz2   = Bnnz2;
6768   coo->Atot1   = Atot1;
6769   coo->Atot2   = Atot2;
6770   coo->Btot1   = Btot1;
6771   coo->Btot2   = Btot2;
6772   coo->Ajmap1  = Ajmap1;
6773   coo->Aperm1  = Aperm1;
6774   coo->Bjmap1  = Bjmap1;
6775   coo->Bperm1  = Bperm1;
6776   coo->Aimap2  = Aimap2;
6777   coo->Ajmap2  = Ajmap2;
6778   coo->Aperm2  = Aperm2;
6779   coo->Bimap2  = Bimap2;
6780   coo->Bjmap2  = Bjmap2;
6781   coo->Bperm2  = Bperm2;
6782   coo->Cperm1  = Cperm1;
6783   // Allocate in preallocation. If not used, it has zero cost on host
6784   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6785   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6786   PetscCall(PetscContainerSetPointer(container, coo));
6787   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6788   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6789   PetscCall(PetscContainerDestroy(&container));
6790   PetscFunctionReturn(PETSC_SUCCESS);
6791 }
6792 
6793 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6794 {
6795   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6796   Mat                  A = mpiaij->A, B = mpiaij->B;
6797   PetscScalar         *Aa, *Ba;
6798   PetscScalar         *sendbuf, *recvbuf;
6799   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6800   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6801   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6802   const PetscCount    *Cperm1;
6803   PetscContainer       container;
6804   MatCOOStruct_MPIAIJ *coo;
6805 
6806   PetscFunctionBegin;
6807   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6808   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
6809   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6810   sendbuf = coo->sendbuf;
6811   recvbuf = coo->recvbuf;
6812   Ajmap1  = coo->Ajmap1;
6813   Ajmap2  = coo->Ajmap2;
6814   Aimap2  = coo->Aimap2;
6815   Bjmap1  = coo->Bjmap1;
6816   Bjmap2  = coo->Bjmap2;
6817   Bimap2  = coo->Bimap2;
6818   Aperm1  = coo->Aperm1;
6819   Aperm2  = coo->Aperm2;
6820   Bperm1  = coo->Bperm1;
6821   Bperm2  = coo->Bperm2;
6822   Cperm1  = coo->Cperm1;
6823 
6824   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6825   PetscCall(MatSeqAIJGetArray(B, &Ba));
6826 
6827   /* Pack entries to be sent to remote */
6828   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6829 
6830   /* Send remote entries to their owner and overlap the communication with local computation */
6831   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6832   /* Add local entries to A and B */
6833   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6834     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6835     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6836     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6837   }
6838   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6839     PetscScalar sum = 0.0;
6840     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6841     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6842   }
6843   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6844 
6845   /* Add received remote entries to A and B */
6846   for (PetscCount i = 0; i < coo->Annz2; i++) {
6847     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6848   }
6849   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6850     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6851   }
6852   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6853   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6854   PetscFunctionReturn(PETSC_SUCCESS);
6855 }
6856 
6857 /*MC
6858    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6859 
6860    Options Database Keys:
6861 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6862 
6863    Level: beginner
6864 
6865    Notes:
6866    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values,
6867     in this case the values associated with the rows and columns one passes in are set to zero
6868     in the matrix
6869 
    `MatSetOption`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
    space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
6872 
6873 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6874 M*/
/* Constructor for MATMPIAIJ: allocates the Mat_MPIAIJ data, installs the MPIAIJ
   function table, and registers the named method/conversion implementations
   that other parts of PETSc query via PetscObjectQueryFunction(). */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
{
  Mat_MPIAIJ *b;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));

  PetscCall(PetscNew(&b));
  B->data       = (void *)b;
  B->ops[0]     = MatOps_Values; /* Install the MPIAIJ method table */
  B->assembled  = PETSC_FALSE;
  B->insertmode = NOT_SET_VALUES;
  b->size       = size;

  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));

  /* build cache for off array entries formed */
  PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));

  b->donotstash  = PETSC_FALSE;
  b->colmap      = NULL; /* global-to-local column map; built lazily */
  b->garray      = NULL; /* global indices of off-diagonal columns; built lazily */
  b->roworiented = PETSC_TRUE;

  /* stuff used for matrix vector multiply */
  b->lvec  = NULL;
  b->Mvctx = NULL;

  /* stuff for MatGetRow() */
  b->rowindices   = NULL;
  b->rowvalues    = NULL;
  b->getrowactive = PETSC_FALSE;

  /* flexible pointer used in CUSPARSE classes */
  b->spptr = NULL;

  /* Register named implementations queried elsewhere via PetscObjectQueryFunction() */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
  /* Conversions to other matrix types; optional backends are compiled in conditionally */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
#endif
#if defined(PETSC_HAVE_KOKKOS_KERNELS)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
#endif
#if defined(PETSC_HAVE_MKL_SPARSE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
#endif
#if defined(PETSC_HAVE_SCALAPACK)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
  /* COO assembly interface (see MatSetPreallocationCOO_MPIAIJ/MatSetValuesCOO_MPIAIJ above) */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}
6958 
6959 /*@
6960   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6961   and "off-diagonal" part of the matrix in CSR format.
6962 
6963   Collective
6964 
6965   Input Parameters:
6966 + comm - MPI communicator
6967 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6968 . n    - This value should be the same as the local size used in creating the
6969          x vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
6970          calculated if `N` is given) For square matrices `n` is almost always `m`.
6971 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
6972 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
6973 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6974 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6975 . a    - matrix values
6976 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6977 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6978 - oa   - matrix values
6979 
6980   Output Parameter:
6981 . mat - the matrix
6982 
6983   Level: advanced
6984 
6985   Notes:
6986   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6987   must free the arrays once the matrix has been destroyed and not before.
6988 
6989   The `i` and `j` indices are 0 based
6990 
6991   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6992 
6993   This sets local rows and cannot be used to set off-processor values.
6994 
6995   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6996   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6997   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6998   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6999   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
7000   communication if it is known that only local entries will be set.
7001 
7002 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
7003           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
7004 @*/
7005 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
7006 {
7007   Mat_MPIAIJ *maij;
7008 
7009   PetscFunctionBegin;
7010   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
7011   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
7012   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
7013   PetscCall(MatCreate(comm, mat));
7014   PetscCall(MatSetSizes(*mat, m, n, M, N));
7015   PetscCall(MatSetType(*mat, MATMPIAIJ));
7016   maij = (Mat_MPIAIJ *)(*mat)->data;
7017 
7018   /* the user-supplied CSR arrays are wrapped directly (not copied), so no preallocation pass is needed */
7019   (*mat)->preallocated = PETSC_TRUE;
7020 
7021   PetscCall(PetscLayoutSetUp((*mat)->rmap));
7022   PetscCall(PetscLayoutSetUp((*mat)->cmap));
7023 
7024   /* wrap the "diagonal" block (local, 0-based column ids) and the "off-diagonal" block (global column ids) */
7025   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
7026   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
7027 
7028   /* only local rows were provided, so assembly can skip all off-process communication */
7029   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
7030   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
7031   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
7032   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
7033   /* the nonzero pattern is fixed by the wrapped arrays; error on any new location */
7034   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
7035   PetscFunctionReturn(PETSC_SUCCESS);
7036 }
7033 
7034 /* Backend data attached to C->product->data for MPIAIJ matrix products (AB, AtB, PtAP).
7035    The product is computed as a series of local (seqaij) intermediate products whose
7036    results are scattered into C via COO insertion. */
7037 typedef struct {
7038   Mat       *mp;    /* intermediate products */
7039   PetscBool *mptmp; /* is the intermediate product temporary ? */
7040   PetscInt   cp;    /* number of intermediate products */
7041 
7042   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
7043   PetscInt    *startsj_s, *startsj_r;
7044   PetscScalar *bufa;
7045   Mat          P_oth;
7046 
7047   /* may take advantage of merging product->B */
7048   Mat Bloc; /* B-local by merging diag and off-diag */
7049 
7050   /* cusparse does not have support to split between symbolic and numeric phases.
7051      When api_user is true, we don't need to update the numerical values
7052      of the temporary storage */
7053   PetscBool reusesym;
7054 
7055   /* support for COO values insertion */
7056   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
7057   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i]; own[0] owns the shared buffer (CSR-like layout) */
7058   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i]; off[0] owns the shared buffer (CSR-like layout) */
7059   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
7060   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
7061   PetscMemType mtype;         /* memory type of the COO buffers (host/CUDA/HIP/Kokkos) */
7062 
7063   /* customization */
7064   PetscBool abmerge;    /* merge the diag and off-diag blocks of product->B for AB */
7065   PetscBool P_oth_bind; /* bind P_oth to the CPU */
7066 } MatMatMPIAIJBACKEND;
7064 
7065 /* Destroy callback for the backend product data: releases the intermediate
7066    products, the COO index/value buffers, and the communication SF. */
7067 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
7068 {
7069   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
7070   PetscInt             i;
7071 
7072   PetscFunctionBegin;
7073   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7074   PetscCall(PetscFree(mmdata->bufa));
7075   /* coo_v/coo_w were allocated with PetscSFMalloc() in mmdata->mtype memory */
7076   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7077   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7078   PetscCall(MatDestroy(&mmdata->P_oth));
7079   PetscCall(MatDestroy(&mmdata->Bloc));
7080   PetscCall(PetscSFDestroy(&mmdata->sf));
7081   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7082   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7083   /* own[]/off[] are CSR-like: entry [0] owns the single shared index buffer.
7084      Guard against NULL since product->data is attached before these arrays are
7085      allocated in MatProductSymbolic_MPIAIJBACKEND(); if an error occurred in
7086      between, this destroy would otherwise dereference a NULL pointer and mask
7087      the original error */
7088   if (mmdata->own) PetscCall(PetscFree(mmdata->own[0]));
7089   PetscCall(PetscFree(mmdata->own));
7090   if (mmdata->off) PetscCall(PetscFree(mmdata->off[0]));
7091   PetscCall(PetscFree(mmdata->off));
7092   PetscCall(PetscFree(mmdata));
7093   PetscFunctionReturn(PETSC_SUCCESS);
7094 }
7087 
7088 /* Copy the n entries of A's value array selected by idx[] into v[].
7089    When idx is NULL, the whole value array of A is copied to v[]. */
7090 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7091 {
7092   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7093 
7094   PetscFunctionBegin;
7095   /* prefer a type-specific (e.g. device-aware) implementation when one is registered */
7096   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7097   if (f) {
7098     PetscCall((*f)(A, n, idx, v));
7099   } else {
7100     const PetscScalar *aa;
7101 
7102     PetscCall(MatSeqAIJGetArrayRead(A, &aa));
7103     if (idx && n) {
7104       PetscInt k;
7105 
7106       /* gather the selected entries one by one */
7107       for (k = 0; k < n; k++) v[k] = aa[idx[k]];
7108     } else {
7109       PetscCall(PetscArraycpy(v, aa, n));
7110     }
7111     PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
7112   }
7113   PetscFunctionReturn(PETSC_SUCCESS);
7114 }
7116 
7117 /* Numeric phase for the backend MPIAIJ products: recompute the intermediate
7118    local products, then harvest their values into the COO buffers (coo_v for
7119    locally owned entries, coo_w for entries destined to remote processes),
7120    communicate the remote ones, and insert everything into C via MatSetValuesCOO(). */
7121 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7122 {
7123   MatMatMPIAIJBACKEND *mmdata;
7124   PetscInt             i, n_d, n_o; /* running offsets into coo_v (diag/own) and coo_w (offproc) */
7125 
7126   PetscFunctionBegin;
7127   MatCheckProduct(C, 1);
7128   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7129   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7130   if (!mmdata->reusesym) { /* update temporary matrices */
7131     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7132     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7133   }
7134   /* reusesym only skips the first numeric update right after symbolic; clear it now */
7135   mmdata->reusesym = PETSC_FALSE;
7136 
7137   /* recompute the numerical values of every intermediate product */
7138   for (i = 0; i < mmdata->cp; i++) {
7139     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7140     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7141   }
7142   /* copy values of non-temporary intermediates into the COO buffers, using the
7143      index maps off[]/own[] built during the symbolic phase */
7144   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7145     PetscInt noff;
7146 
7147     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7148     if (mmdata->mptmp[i]) continue; /* temporaries feed later products; their values are not inserted */
7149     if (noff) {
7150       PetscInt nown;
7151 
7152       /* this product has rows owned by other processes: split its values between
7153          the send buffer (coo_w) and the local buffer (coo_v) */
7154       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7155       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7156       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7157       n_o += noff;
7158       n_d += nown;
7159     } else {
7160       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7161 
7162       /* fully local product: copy its whole value array */
7163       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7164       n_d += mm->nz;
7165     }
7166   }
7167   if (mmdata->hasoffproc) { /* offprocess insertion */
7168     /* received values are appended after the n_d locally produced ones, matching
7169        the (i,j) layout established in the symbolic phase */
7170     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7171     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7172   }
7173   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7174   PetscFunctionReturn(PETSC_SUCCESS);
7175 }
7163 
7164 /* Support for Pt * A, A * P, or Pt * A * P */
7165 #define MAX_NUMBER_INTERMEDIATE 4
7166 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7167 {
7168   Mat_Product           *product = C->product;
7169   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7170   Mat_MPIAIJ            *a, *p;
7171   MatMatMPIAIJBACKEND   *mmdata;
7172   ISLocalToGlobalMapping P_oth_l2g = NULL;
7173   IS                     glob      = NULL;
7174   const char            *prefix;
7175   char                   pprefix[256];
7176   const PetscInt        *globidx, *P_oth_idx;
7177   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7178   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7179   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7180                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7181                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7182   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
7183 
7184   MatProductType ptype;
7185   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7186   PetscMPIInt    size;
7187 
7188   PetscFunctionBegin;
7189   MatCheckProduct(C, 1);
7190   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7191   ptype = product->type;
7192   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7193     ptype                                          = MATPRODUCT_AB;
7194     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7195   }
7196   switch (ptype) {
7197   case MATPRODUCT_AB:
7198     A          = product->A;
7199     P          = product->B;
7200     m          = A->rmap->n;
7201     n          = P->cmap->n;
7202     M          = A->rmap->N;
7203     N          = P->cmap->N;
7204     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7205     break;
7206   case MATPRODUCT_AtB:
7207     P          = product->A;
7208     A          = product->B;
7209     m          = P->cmap->n;
7210     n          = A->cmap->n;
7211     M          = P->cmap->N;
7212     N          = A->cmap->N;
7213     hasoffproc = PETSC_TRUE;
7214     break;
7215   case MATPRODUCT_PtAP:
7216     A          = product->A;
7217     P          = product->B;
7218     m          = P->cmap->n;
7219     n          = P->cmap->n;
7220     M          = P->cmap->N;
7221     N          = P->cmap->N;
7222     hasoffproc = PETSC_TRUE;
7223     break;
7224   default:
7225     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7226   }
7227   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7228   if (size == 1) hasoffproc = PETSC_FALSE;
7229 
7230   /* defaults */
7231   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7232     mp[i]    = NULL;
7233     mptmp[i] = PETSC_FALSE;
7234     rmapt[i] = -1;
7235     cmapt[i] = -1;
7236     rmapa[i] = NULL;
7237     cmapa[i] = NULL;
7238   }
7239 
7240   /* customization */
7241   PetscCall(PetscNew(&mmdata));
7242   mmdata->reusesym = product->api_user;
7243   if (ptype == MATPRODUCT_AB) {
7244     if (product->api_user) {
7245       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7246       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7247       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7248       PetscOptionsEnd();
7249     } else {
7250       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7251       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7252       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7253       PetscOptionsEnd();
7254     }
7255   } else if (ptype == MATPRODUCT_PtAP) {
7256     if (product->api_user) {
7257       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7258       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7259       PetscOptionsEnd();
7260     } else {
7261       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7262       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7263       PetscOptionsEnd();
7264     }
7265   }
7266   a = (Mat_MPIAIJ *)A->data;
7267   p = (Mat_MPIAIJ *)P->data;
7268   PetscCall(MatSetSizes(C, m, n, M, N));
7269   PetscCall(PetscLayoutSetUp(C->rmap));
7270   PetscCall(PetscLayoutSetUp(C->cmap));
7271   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7272   PetscCall(MatGetOptionsPrefix(C, &prefix));
7273 
7274   cp = 0;
7275   switch (ptype) {
7276   case MATPRODUCT_AB: /* A * P */
7277     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7278 
7279     /* A_diag * P_local (merged or not) */
7280     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7281       /* P is product->B */
7282       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7283       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7284       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7285       PetscCall(MatProductSetFill(mp[cp], product->fill));
7286       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7287       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7288       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7289       mp[cp]->product->api_user = product->api_user;
7290       PetscCall(MatProductSetFromOptions(mp[cp]));
7291       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7292       PetscCall(ISGetIndices(glob, &globidx));
7293       rmapt[cp] = 1;
7294       cmapt[cp] = 2;
7295       cmapa[cp] = globidx;
7296       mptmp[cp] = PETSC_FALSE;
7297       cp++;
7298     } else { /* A_diag * P_diag and A_diag * P_off */
7299       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7300       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7301       PetscCall(MatProductSetFill(mp[cp], product->fill));
7302       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7303       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7304       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7305       mp[cp]->product->api_user = product->api_user;
7306       PetscCall(MatProductSetFromOptions(mp[cp]));
7307       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7308       rmapt[cp] = 1;
7309       cmapt[cp] = 1;
7310       mptmp[cp] = PETSC_FALSE;
7311       cp++;
7312       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7313       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7314       PetscCall(MatProductSetFill(mp[cp], product->fill));
7315       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7316       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7317       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7318       mp[cp]->product->api_user = product->api_user;
7319       PetscCall(MatProductSetFromOptions(mp[cp]));
7320       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7321       rmapt[cp] = 1;
7322       cmapt[cp] = 2;
7323       cmapa[cp] = p->garray;
7324       mptmp[cp] = PETSC_FALSE;
7325       cp++;
7326     }
7327 
7328     /* A_off * P_other */
7329     if (mmdata->P_oth) {
7330       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7331       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7332       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7333       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7334       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7335       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7336       PetscCall(MatProductSetFill(mp[cp], product->fill));
7337       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7338       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7339       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7340       mp[cp]->product->api_user = product->api_user;
7341       PetscCall(MatProductSetFromOptions(mp[cp]));
7342       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7343       rmapt[cp] = 1;
7344       cmapt[cp] = 2;
7345       cmapa[cp] = P_oth_idx;
7346       mptmp[cp] = PETSC_FALSE;
7347       cp++;
7348     }
7349     break;
7350 
7351   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7352     /* A is product->B */
7353     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7354     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7355       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7356       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7357       PetscCall(MatProductSetFill(mp[cp], product->fill));
7358       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7359       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7360       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7361       mp[cp]->product->api_user = product->api_user;
7362       PetscCall(MatProductSetFromOptions(mp[cp]));
7363       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7364       PetscCall(ISGetIndices(glob, &globidx));
7365       rmapt[cp] = 2;
7366       rmapa[cp] = globidx;
7367       cmapt[cp] = 2;
7368       cmapa[cp] = globidx;
7369       mptmp[cp] = PETSC_FALSE;
7370       cp++;
7371     } else {
7372       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7373       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7374       PetscCall(MatProductSetFill(mp[cp], product->fill));
7375       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7376       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7377       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7378       mp[cp]->product->api_user = product->api_user;
7379       PetscCall(MatProductSetFromOptions(mp[cp]));
7380       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7381       PetscCall(ISGetIndices(glob, &globidx));
7382       rmapt[cp] = 1;
7383       cmapt[cp] = 2;
7384       cmapa[cp] = globidx;
7385       mptmp[cp] = PETSC_FALSE;
7386       cp++;
7387       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7388       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7389       PetscCall(MatProductSetFill(mp[cp], product->fill));
7390       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7391       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7392       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7393       mp[cp]->product->api_user = product->api_user;
7394       PetscCall(MatProductSetFromOptions(mp[cp]));
7395       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7396       rmapt[cp] = 2;
7397       rmapa[cp] = p->garray;
7398       cmapt[cp] = 2;
7399       cmapa[cp] = globidx;
7400       mptmp[cp] = PETSC_FALSE;
7401       cp++;
7402     }
7403     break;
7404   case MATPRODUCT_PtAP:
7405     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7406     /* P is product->B */
7407     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7408     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7409     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7410     PetscCall(MatProductSetFill(mp[cp], product->fill));
7411     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7412     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7413     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7414     mp[cp]->product->api_user = product->api_user;
7415     PetscCall(MatProductSetFromOptions(mp[cp]));
7416     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7417     PetscCall(ISGetIndices(glob, &globidx));
7418     rmapt[cp] = 2;
7419     rmapa[cp] = globidx;
7420     cmapt[cp] = 2;
7421     cmapa[cp] = globidx;
7422     mptmp[cp] = PETSC_FALSE;
7423     cp++;
7424     if (mmdata->P_oth) {
7425       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7426       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7427       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7428       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7429       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7430       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7431       PetscCall(MatProductSetFill(mp[cp], product->fill));
7432       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7433       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7434       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7435       mp[cp]->product->api_user = product->api_user;
7436       PetscCall(MatProductSetFromOptions(mp[cp]));
7437       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7438       mptmp[cp] = PETSC_TRUE;
7439       cp++;
7440       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7441       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7442       PetscCall(MatProductSetFill(mp[cp], product->fill));
7443       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7444       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7445       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7446       mp[cp]->product->api_user = product->api_user;
7447       PetscCall(MatProductSetFromOptions(mp[cp]));
7448       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7449       rmapt[cp] = 2;
7450       rmapa[cp] = globidx;
7451       cmapt[cp] = 2;
7452       cmapa[cp] = P_oth_idx;
7453       mptmp[cp] = PETSC_FALSE;
7454       cp++;
7455     }
7456     break;
7457   default:
7458     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7459   }
7460   /* sanity check */
7461   if (size > 1)
7462     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7463 
7464   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7465   for (i = 0; i < cp; i++) {
7466     mmdata->mp[i]    = mp[i];
7467     mmdata->mptmp[i] = mptmp[i];
7468   }
7469   mmdata->cp             = cp;
7470   C->product->data       = mmdata;
7471   C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
7472   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7473 
7474   /* memory type */
7475   mmdata->mtype = PETSC_MEMTYPE_HOST;
7476   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7477   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7478   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7479   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7480   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7481   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7482 
7483   /* prepare coo coordinates for values insertion */
7484 
7485   /* count total nonzeros of those intermediate seqaij Mats
7486     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7487     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted to remote procs
7488     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7489   */
7490   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7491     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7492     if (mptmp[cp]) continue;
7493     if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scatter to all processes (might include self) */
7494       const PetscInt *rmap = rmapa[cp];
7495       const PetscInt  mr   = mp[cp]->rmap->n;
7496       const PetscInt  rs   = C->rmap->rstart;
7497       const PetscInt  re   = C->rmap->rend;
7498       const PetscInt *ii   = mm->i;
7499       for (i = 0; i < mr; i++) {
7500         const PetscInt gr = rmap[i];
7501         const PetscInt nz = ii[i + 1] - ii[i];
7502         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7503         else ncoo_oown += nz;                  /* this row is local */
7504       }
7505     } else ncoo_d += mm->nz;
7506   }
7507 
7508   /*
7509     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7510 
7511     ncoo = ncoo_d + ncoo_oown + ncoo2, which ncoo2 is number of nonzeros inserted to me by other procs.
7512 
7513     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly, for own[0].
7514 
7515     off[p]: points to the segment for matrix mp[p], storing location of nonzeros that mp[p] will insert to others
7516     own[p]: points to the segment for matrix mp[p], storing location of nonzeros that mp[p] will insert locally
7517     so, off[p+1]-off[p] is the number of nonzeros that mp[p] will send to others.
7518 
7519     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7520     Ex. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores i of local nonzeros, and the remaining part stores i of nonzeros I will receive.
7521   */
7522   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7523   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7524 
7525   /* gather (i,j) of nonzeros inserted by remote procs */
7526   if (hasoffproc) {
7527     PetscSF  msf;
7528     PetscInt ncoo2, *coo_i2, *coo_j2;
7529 
7530     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7531     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7532     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7533 
7534     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7535       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7536       PetscInt   *idxoff = mmdata->off[cp];
7537       PetscInt   *idxown = mmdata->own[cp];
7538       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7539         const PetscInt *rmap = rmapa[cp];
7540         const PetscInt *cmap = cmapa[cp];
7541         const PetscInt *ii   = mm->i;
7542         PetscInt       *coi  = coo_i + ncoo_o;
7543         PetscInt       *coj  = coo_j + ncoo_o;
7544         const PetscInt  mr   = mp[cp]->rmap->n;
7545         const PetscInt  rs   = C->rmap->rstart;
7546         const PetscInt  re   = C->rmap->rend;
7547         const PetscInt  cs   = C->cmap->rstart;
7548         for (i = 0; i < mr; i++) {
7549           const PetscInt *jj = mm->j + ii[i];
7550           const PetscInt  gr = rmap[i];
7551           const PetscInt  nz = ii[i + 1] - ii[i];
7552           if (gr < rs || gr >= re) { /* this is an offproc row */
7553             for (j = ii[i]; j < ii[i + 1]; j++) {
7554               *coi++    = gr;
7555               *idxoff++ = j;
7556             }
7557             if (!cmapt[cp]) { /* already global */
7558               for (j = 0; j < nz; j++) *coj++ = jj[j];
7559             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7560               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7561             } else { /* offdiag */
7562               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7563             }
7564             ncoo_o += nz;
7565           } else { /* this is a local row */
7566             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7567           }
7568         }
7569       }
7570       mmdata->off[cp + 1] = idxoff;
7571       mmdata->own[cp + 1] = idxown;
7572     }
7573 
7574     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7575     PetscInt incoo_o;
7576     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7577     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7578     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7579     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7580     ncoo = ncoo_d + ncoo_oown + ncoo2;
7581     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7582     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7583     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7584     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7585     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7586     PetscCall(PetscFree2(coo_i, coo_j));
7587     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7588     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7589     coo_i = coo_i2;
7590     coo_j = coo_j2;
7591   } else { /* no offproc values insertion */
7592     ncoo = ncoo_d;
7593     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7594 
7595     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7596     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7597     PetscCall(PetscSFSetUp(mmdata->sf));
7598   }
7599   mmdata->hasoffproc = hasoffproc;
7600 
7601   /* gather (i,j) of nonzeros inserted locally */
7602   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7603     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7604     PetscInt       *coi  = coo_i + ncoo_d;
7605     PetscInt       *coj  = coo_j + ncoo_d;
7606     const PetscInt *jj   = mm->j;
7607     const PetscInt *ii   = mm->i;
7608     const PetscInt *cmap = cmapa[cp];
7609     const PetscInt *rmap = rmapa[cp];
7610     const PetscInt  mr   = mp[cp]->rmap->n;
7611     const PetscInt  rs   = C->rmap->rstart;
7612     const PetscInt  re   = C->rmap->rend;
7613     const PetscInt  cs   = C->cmap->rstart;
7614 
7615     if (mptmp[cp]) continue;
7616     if (rmapt[cp] == 1) { /* consecutive rows */
7617       /* fill coo_i */
7618       for (i = 0; i < mr; i++) {
7619         const PetscInt gr = i + rs;
7620         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7621       }
7622       /* fill coo_j */
7623       if (!cmapt[cp]) { /* type-0, already global */
7624         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7625       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7626         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7627       } else {                                            /* type-2, local to global for sparse columns */
7628         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7629       }
7630       ncoo_d += mm->nz;
7631     } else if (rmapt[cp] == 2) { /* sparse rows */
7632       for (i = 0; i < mr; i++) {
7633         const PetscInt *jj = mm->j + ii[i];
7634         const PetscInt  gr = rmap[i];
7635         const PetscInt  nz = ii[i + 1] - ii[i];
7636         if (gr >= rs && gr < re) { /* local rows */
7637           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7638           if (!cmapt[cp]) { /* type-0, already global */
7639             for (j = 0; j < nz; j++) *coj++ = jj[j];
7640           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7641             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7642           } else { /* type-2, local to global for sparse columns */
7643             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7644           }
7645           ncoo_d += nz;
7646         }
7647       }
7648     }
7649   }
7650   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7651   PetscCall(ISDestroy(&glob));
7652   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7653   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7654   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7655   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7656 
7657   /* set block sizes */
7658   A = product->A;
7659   P = product->B;
7660   switch (ptype) {
7661   case MATPRODUCT_PtAP:
7662     if (P->cmap->bs > 1) PetscCall(MatSetBlockSizes(C, P->cmap->bs, P->cmap->bs));
7663     break;
7664   case MATPRODUCT_RARt:
7665     if (P->rmap->bs > 1) PetscCall(MatSetBlockSizes(C, P->rmap->bs, P->rmap->bs));
7666     break;
7667   case MATPRODUCT_ABC:
7668     PetscCall(MatSetBlockSizesFromMats(C, A, product->C));
7669     break;
7670   case MATPRODUCT_AB:
7671     PetscCall(MatSetBlockSizesFromMats(C, A, P));
7672     break;
7673   case MATPRODUCT_AtB:
7674     if (A->cmap->bs > 1 || P->cmap->bs > 1) PetscCall(MatSetBlockSizes(C, A->cmap->bs, P->cmap->bs));
7675     break;
7676   case MATPRODUCT_ABt:
7677     if (A->rmap->bs > 1 || P->rmap->bs > 1) PetscCall(MatSetBlockSizes(C, A->rmap->bs, P->rmap->bs));
7678     break;
7679   default:
7680     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for ProductType %s", MatProductTypes[ptype]);
7681   }
7682 
7683   /* preallocate with COO data */
7684   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7685   PetscCall(PetscFree2(coo_i, coo_j));
7686   PetscFunctionReturn(PETSC_SUCCESS);
7687 }
7688 
7689 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7690 {
7691   Mat_Product *product = mat->product;
7692 #if defined(PETSC_HAVE_DEVICE)
7693   PetscBool match  = PETSC_FALSE;
7694   PetscBool usecpu = PETSC_FALSE;
7695 #else
7696   PetscBool match = PETSC_TRUE;
7697 #endif
7698 
7699   PetscFunctionBegin;
7700   MatCheckProduct(mat, 1);
7701 #if defined(PETSC_HAVE_DEVICE)
7702   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7703   if (match) { /* we can always fallback to the CPU if requested */
7704     switch (product->type) {
7705     case MATPRODUCT_AB:
7706       if (product->api_user) {
7707         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7708         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7709         PetscOptionsEnd();
7710       } else {
7711         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7712         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7713         PetscOptionsEnd();
7714       }
7715       break;
7716     case MATPRODUCT_AtB:
7717       if (product->api_user) {
7718         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7719         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7720         PetscOptionsEnd();
7721       } else {
7722         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7723         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7724         PetscOptionsEnd();
7725       }
7726       break;
7727     case MATPRODUCT_PtAP:
7728       if (product->api_user) {
7729         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7730         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7731         PetscOptionsEnd();
7732       } else {
7733         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7734         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7735         PetscOptionsEnd();
7736       }
7737       break;
7738     default:
7739       break;
7740     }
7741     match = (PetscBool)!usecpu;
7742   }
7743 #endif
7744   if (match) {
7745     switch (product->type) {
7746     case MATPRODUCT_AB:
7747     case MATPRODUCT_AtB:
7748     case MATPRODUCT_PtAP:
7749       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7750       break;
7751     default:
7752       break;
7753     }
7754   }
7755   /* fallback to MPIAIJ ops */
7756   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7757   PetscFunctionReturn(PETSC_SUCCESS);
7758 }
7759 
7760 /*
7761    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7762 
7763    n - the number of block indices in cc[]
7764    cc - the block indices (must be large enough to contain the indices)
7765 */
7766 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7767 {
7768   PetscInt        cnt = -1, nidx, j;
7769   const PetscInt *idx;
7770 
7771   PetscFunctionBegin;
7772   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7773   if (nidx) {
7774     cnt     = 0;
7775     cc[cnt] = idx[0] / bs;
7776     for (j = 1; j < nidx; j++) {
7777       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7778     }
7779   }
7780   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7781   *n = cnt + 1;
7782   PetscFunctionReturn(PETSC_SUCCESS);
7783 }
7784 
7785 /*
7786     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7787 
7788     ncollapsed - the number of block indices
7789     collapsed - the block indices (must be large enough to contain the indices)
7790 */
7791 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7792 {
7793   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7794 
7795   PetscFunctionBegin;
7796   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7797   for (i = start + 1; i < start + bs; i++) {
7798     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7799     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7800     cprevtmp = cprev;
7801     cprev    = merged;
7802     merged   = cprevtmp;
7803   }
7804   *ncollapsed = nprev;
7805   if (collapsed) *collapsed = cprev;
7806   PetscFunctionReturn(PETSC_SUCCESS);
7807 }
7808 
7809 /*
7810  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7811 
7812  Input Parameter:
7813  . Amat - matrix
7814  - symmetrize - make the result symmetric
7815  + scale - scale with diagonal
7816 
7817  Output Parameter:
7818  . a_Gmat - output scalar graph >= 0
7819 
7820 */
7821 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7822 {
7823   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7824   MPI_Comm  comm;
7825   Mat       Gmat;
7826   PetscBool ismpiaij, isseqaij;
7827   Mat       a, b, c;
7828   MatType   jtype;
7829 
7830   PetscFunctionBegin;
7831   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7832   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7833   PetscCall(MatGetSize(Amat, &MM, &NN));
7834   PetscCall(MatGetBlockSize(Amat, &bs));
7835   nloc = (Iend - Istart) / bs;
7836 
7837   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7838   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7839   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7840 
7841   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
7842   /* A solution consists in providing a new API, MatAIJGetCollapsedAIJ, and each class can provide a fast
7843      implementation */
7844   if (bs > 1) {
7845     PetscCall(MatGetType(Amat, &jtype));
7846     PetscCall(MatCreate(comm, &Gmat));
7847     PetscCall(MatSetType(Gmat, jtype));
7848     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7849     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7850     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7851       PetscInt  *d_nnz, *o_nnz;
7852       MatScalar *aa, val, *AA;
7853       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7854 
7855       if (isseqaij) {
7856         a = Amat;
7857         b = NULL;
7858       } else {
7859         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7860         a             = d->A;
7861         b             = d->B;
7862       }
7863       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7864       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7865       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7866         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7867         const PetscInt *cols1, *cols2;
7868 
7869         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7870           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7871           nnz[brow / bs] = nc2 / bs;
7872           if (nc2 % bs) ok = 0;
7873           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7874           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7875             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7876             if (nc1 != nc2) ok = 0;
7877             else {
7878               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7879                 if (cols1[jj] != cols2[jj]) ok = 0;
7880                 if (cols1[jj] % bs != jj % bs) ok = 0;
7881               }
7882             }
7883             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7884           }
7885           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7886           if (!ok) {
7887             PetscCall(PetscFree2(d_nnz, o_nnz));
7888             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7889             goto old_bs;
7890           }
7891         }
7892       }
7893       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7894       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7895       PetscCall(PetscFree2(d_nnz, o_nnz));
7896       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7897       // diag
7898       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7899         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7900 
7901         ai = aseq->i;
7902         n  = ai[brow + 1] - ai[brow];
7903         aj = aseq->j + ai[brow];
7904         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7905           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7906           val        = 0;
7907           if (index_size == 0) {
7908             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7909               aa = aseq->a + ai[brow + ii] + k;
7910               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7911                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7912               }
7913             }
7914           } else {                                            // use (index,index) value if provided
7915             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7916               PetscInt ii = index[iii];
7917               aa          = aseq->a + ai[brow + ii] + k;
7918               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7919                 PetscInt jj = index[jjj];
7920                 val += PetscAbs(PetscRealPart(aa[jj]));
7921               }
7922             }
7923           }
7924           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7925           AA[k / bs] = val;
7926         }
7927         grow = Istart / bs + brow / bs;
7928         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7929       }
7930       // off-diag
7931       if (ismpiaij) {
7932         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7933         const PetscScalar *vals;
7934         const PetscInt    *cols, *garray = aij->garray;
7935 
7936         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
7937         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7938           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7939           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7940             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7941             AA[k / bs] = 0;
7942             AJ[cidx]   = garray[cols[k]] / bs;
7943           }
7944           nc = ncols / bs;
7945           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7946           if (index_size == 0) {
7947             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7948               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7949               for (PetscInt k = 0; k < ncols; k += bs) {
7950                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7951                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7952                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7953                 }
7954               }
7955               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7956             }
7957           } else {                                            // use (index,index) value if provided
7958             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7959               PetscInt ii = index[iii];
7960               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7961               for (PetscInt k = 0; k < ncols; k += bs) {
7962                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7963                   PetscInt jj = index[jjj];
7964                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7965                 }
7966               }
7967               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7968             }
7969           }
7970           grow = Istart / bs + brow / bs;
7971           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7972         }
7973       }
7974       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7975       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7976       PetscCall(PetscFree2(AA, AJ));
7977     } else {
7978       const PetscScalar *vals;
7979       const PetscInt    *idx;
7980       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7981     old_bs:
7982       /*
7983        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7984        */
7985       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7986       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7987       if (isseqaij) {
7988         PetscInt max_d_nnz;
7989 
7990         /*
7991          Determine exact preallocation count for (sequential) scalar matrix
7992          */
7993         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7994         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7995         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7996         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7997         PetscCall(PetscFree3(w0, w1, w2));
7998       } else if (ismpiaij) {
7999         Mat             Daij, Oaij;
8000         const PetscInt *garray;
8001         PetscInt        max_d_nnz;
8002 
8003         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
8004         /*
8005          Determine exact preallocation count for diagonal block portion of scalar matrix
8006          */
8007         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
8008         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
8009         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
8010         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
8011         PetscCall(PetscFree3(w0, w1, w2));
8012         /*
8013          Over estimate (usually grossly over), preallocation count for off-diagonal portion of scalar matrix
8014          */
8015         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
8016           o_nnz[jj] = 0;
8017           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
8018             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
8019             o_nnz[jj] += ncols;
8020             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
8021           }
8022           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
8023         }
8024       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
8025       /* get scalar copy (norms) of matrix */
8026       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
8027       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
8028       PetscCall(PetscFree2(d_nnz, o_nnz));
8029       for (Ii = Istart; Ii < Iend; Ii++) {
8030         PetscInt dest_row = Ii / bs;
8031 
8032         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
8033         for (jj = 0; jj < ncols; jj++) {
8034           PetscInt    dest_col = idx[jj] / bs;
8035           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
8036 
8037           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
8038         }
8039         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
8040       }
8041       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
8042       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
8043     }
8044   } else {
8045     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
8046     else {
8047       Gmat = Amat;
8048       PetscCall(PetscObjectReference((PetscObject)Gmat));
8049     }
8050     if (isseqaij) {
8051       a = Gmat;
8052       b = NULL;
8053     } else {
8054       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
8055       a             = d->A;
8056       b             = d->B;
8057     }
8058     if (filter >= 0 || scale) {
8059       /* take absolute value of each entry */
8060       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
8061         MatInfo      info;
8062         PetscScalar *avals;
8063 
8064         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
8065         PetscCall(MatSeqAIJGetArray(c, &avals));
8066         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
8067         PetscCall(MatSeqAIJRestoreArray(c, &avals));
8068       }
8069     }
8070   }
8071   if (symmetrize) {
8072     PetscBool isset, issym;
8073 
8074     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
8075     if (!isset || !issym) {
8076       Mat matTrans;
8077 
8078       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
8079       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
8080       PetscCall(MatDestroy(&matTrans));
8081     }
8082     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
8083   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
8084   if (scale) {
8085     /* scale c for all diagonal values = 1 or -1 */
8086     Vec diag;
8087 
8088     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8089     PetscCall(MatGetDiagonal(Gmat, diag));
8090     PetscCall(VecReciprocal(diag));
8091     PetscCall(VecSqrtAbs(diag));
8092     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8093     PetscCall(VecDestroy(&diag));
8094   }
8095   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8096   if (filter >= 0) {
8097     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8098     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8099   }
8100   *a_Gmat = Gmat;
8101   PetscFunctionReturn(PETSC_SUCCESS);
8102 }
8103 
8104 /*
8105     Special version for direct calls from Fortran
8106 */
8107 
8108 /* Change these macros so can be used in void function */
8109 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8110 #undef PetscCall
8111 #define PetscCall(...) \
8112   do { \
8113     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8114     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8115       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8116       return; \
8117     } \
8118   } while (0)
8119 
8120 #undef SETERRQ
8121 #define SETERRQ(comm, ierr, ...) \
8122   do { \
8123     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8124     return; \
8125   } while (0)
8126 
8127 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8128   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8129 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8130   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8131 #else
8132 #endif
/*
  matsetvaluesmpiaij_ - Fortran-callable version of MatSetValues() specialized for MATMPIAIJ

  Arguments mirror MatSetValues(): m rows im[], n columns in[] (global, 0-based), values v[]
  in row- or column-major order depending on the matrix's roworiented flag, and insert mode addv.
  Errors are reported through *_ierr via the PetscCall/SETERRQ macros redefined above for void functions.

  NOTE(review): this duplicates the logic of MatSetValues_MPIAIJ and relies on the
  MatSetValues_SeqAIJ_A_Private()/MatSetValues_SeqAIJ_B_Private() macros, which read and update the
  local variables rp1/ap1/low1/... and rp2/ap2/low2/... declared below; keep names and order intact.
*/
PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
{
  Mat         mat = *mmat;
  PetscInt    m = *mm, n = *mn;
  InsertMode  addv = *maddv;
  Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
  PetscScalar value;

  MatCheckPreallocated(mat, 1);
  /* a matrix may not mix ADD_VALUES and INSERT_VALUES between assemblies */
  if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
  else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
  {
    PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
    PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
    PetscBool roworiented = aij->roworiented;

    /* Some Variables required in the macro */
    Mat         A     = aij->A;
    Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
    PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
    MatScalar  *aa;
    PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
    Mat         B                 = aij->B;
    Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
    PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
    MatScalar  *ba;
    /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
     * cannot use "#if defined" inside a macro. */
    PETSC_UNUSED PetscBool inserted = PETSC_FALSE;

    PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
    PetscInt   nonew = a->nonew;
    MatScalar *ap1, *ap2;

    PetscFunctionBegin;
    PetscCall(MatSeqAIJGetArray(A, &aa));
    PetscCall(MatSeqAIJGetArray(B, &ba));
    for (i = 0; i < m; i++) {
      if (im[i] < 0) continue; /* negative row indices are ignored, as in MatSetValues() */
      PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
      if (im[i] >= rstart && im[i] < rend) {
        /* locally owned row: set up binary-search state for the diagonal (A) and off-diagonal (B) parts */
        row      = im[i] - rstart;
        lastcol1 = -1;
        rp1      = aj + ai[row];
        ap1      = aa + ai[row];
        rmax1    = aimax[row];
        nrow1    = ailen[row];
        low1     = 0;
        high1    = nrow1;
        lastcol2 = -1;
        rp2      = bj + bi[row];
        ap2      = ba + bi[row];
        rmax2    = bimax[row];
        nrow2    = bilen[row];
        low2     = 0;
        high2    = nrow2;

        for (j = 0; j < n; j++) {
          if (roworiented) value = v[i * n + j];
          else value = v[i + j * m];
          if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
          if (in[j] >= cstart && in[j] < cend) {
            /* column owned by this process: insert into the diagonal block A */
            col = in[j] - cstart;
            MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
          } else if (in[j] < 0) continue;
          else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
            SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          } else {
            /* off-process column: insert into the off-diagonal block B, translating global -> local via colmap */
            if (mat->was_assembled) {
              if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
              PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
              col--;
#else
              col = aij->colmap[in[j]] - 1;
#endif
              if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
                /* column not in B's current pattern: disassemble so B uses global indices again */
                PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
                col = in[j];
                /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
                B        = aij->B;
                b        = (Mat_SeqAIJ *)B->data;
                bimax    = b->imax;
                bi       = b->i;
                bilen    = b->ilen;
                bj       = b->j;
                rp2      = bj + bi[row];
                ap2      = ba + bi[row];
                rmax2    = bimax[row];
                nrow2    = bilen[row];
                low2     = 0;
                high2    = nrow2;
                bm       = aij->B->rmap->n;
                ba       = b->a;
                inserted = PETSC_FALSE;
              }
            } else col = in[j];
            MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
          }
        }
      } else if (!aij->donotstash) {
        /* row owned by another process: stash values for communication during assembly */
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
    PetscCall(MatSeqAIJRestoreArray(A, &aa));
    PetscCall(MatSeqAIJRestoreArray(B, &ba));
  }
  PetscFunctionReturnVoid();
}
8246 
8247 /* Undefining these here since they were redefined from their original definition above! No
8248  * other PETSc functions should be defined past this point, as it is impossible to recover the
8249  * original definitions */
8250 #undef PetscCall
8251 #undef SETERRQ
8252