xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 6497c311e7b976d467be1503c1effce92a60525c)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processes.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
/*
   Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks

   The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
   that is a very major recoding job.

   Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin

   Input Parameters:
+  bs   - block size shared by the row and column layouts of the matrix
.  imax - number of entries in is[]
-  is   - array of index sets; each non-NULL entry is destroyed and replaced by an equivalent block IS

   Note:
   Processing stops at the first NULL entry of is[].
*/
static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
{
  PetscFunctionBegin;
  for (PetscInt i = 0; i < imax; i++) {
    if (!is[i]) break; /* a NULL IS terminates the list */
    PetscInt        n = 0, N, Nmax, Nmin;
    const PetscInt *idx;
    PetscInt       *nidx = NULL;
    MPI_Comm        comm;
    PetscBT         bt;

    PetscCall(ISGetLocalSize(is[i], &N));
    if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
      PetscCall(ISGetIndices(is[i], &idx));
      PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
      /* work in units of blocks; every index maps into [Nmin/bs, Nmax/bs] */
      Nmin = Nmin / bs;
      Nmax = Nmax / bs;
      PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
      /* first pass: count the distinct blocks touched (PetscBTLookupSet() returns the previous bit value) */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
      }
      PetscCall(PetscMalloc1(n, &nidx));
      n = 0;
      PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
      /* second pass: record each distinct block number exactly once, in first-seen order */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
      }
      PetscCall(PetscBTDestroy(&bt));
      PetscCall(ISRestoreIndices(is[i], &idx));
    }
    PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
    PetscCall(ISDestroy(is + i));
    /* ownership of nidx passes to the new IS (PETSC_OWN_POINTER); nidx is NULL/n==0 for an empty input IS */
    PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - expand every index set in is[] by one level of overlap.

   Rows owned remotely are gathered per owner and exchanged with PetscCommBuildTwoSided() plus a
   PetscSF broadcast, so memory and communication scale with the number of remote rows requested
   rather than with the communicator size.

   Collective

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - array of index sets; each is replaced by its overlapped version
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  tlength = 0;
  for (PetscInt i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  /* record, for every index not owned locally, its owner rank, its IS id, and the row itself */
  for (PetscInt i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (PetscInt j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    /* no rank needs remote data: only the local expansion is required */
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (PetscMPIInt i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* MPI process */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (PetscInt i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  /* each incoming entry points back at the sender's buffer slot: rank + offset-within-message */
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) {
    for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (PetscInt i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* process */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second round: learn who replies to us and how long each reply is */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) {
    for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  /* merge the received overlap rows into the (already locally expanded) index sets */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
231 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from other processes
   into the index sets.

   recvdata is a sequence of triples <is id, count, count rows...> (the format produced by
   MatIncreaseOverlap_MPIAIJ_Send_Scalable()); each IS is rebuilt as a sorted, duplicate-free
   general IS containing its old entries plus the received rows.

   Input Parameters:
+  mat      - the MPIAIJ matrix (unused here except as context)
.  nidx     - number of index sets
.  is       - array of index sets, each destroyed and recreated with the merged entries
.  nrecvs   - total number of integers in recvdata
-  recvdata - the received messages
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one scratch slab per IS of size max_lsize + nrecvs: enough for the old entries plus, in the
     worst case, every received value landing in the same IS */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* keep each IS's communicator alive so the IS can be recreated on it below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}
279 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - build the reply messages for the overlap requests
   this process received: for every requested locally owned row, collect the global column
   indices of that row (from both the diagonal A part and the off-diagonal B part).

   Replies use the triple format <is id, count, count rows...> consumed by
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable().

   Input Parameters:
+  mat       - the MPIAIJ matrix
.  nidx      - number of index sets being overlapped
.  nfrom     - number of requesting processes
.  fromranks - ranks of the requesting processes
.  fromsizes - per-sender {length, offset} pairs as returned by PetscCommBuildTwoSided()
-  fromrows  - concatenated request messages, pairs <is id, row>

   Output Parameters:
+  sbrowsizes - per-destination {length, offset} pairs for the replies (length 2*nfrom)
-  sbrows     - concatenated reply messages; allocated here together with sbrowsizes via
                PetscCalloc2(), the caller frees both with PetscFree2(sbrows, sbrowsizes)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count (indv_counts) the per-<sender,IS> reply sizes so sbdata can be
     allocated in a single shot */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* convert to local row number */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: repeat the traversal, this time writing the triples into sbdata */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-destination lengths into {length, offset} pairs for PetscCommBuildTwoSided() */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
401 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - the purely local part of the overlap increase:
   for every locally owned row listed in an index set, add the global column indices of that
   row (from both the diagonal A part and the off-diagonal B part) to the set; rows owned by
   other ranks are skipped here and handled by the send/receive routines.

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - array of index sets, each destroyed and recreated as a sorted duplicate-free general IS
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remote rows are handled by the send/receive path */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart; /* shift diagonal-block columns to global numbering */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]]; /* off-diagonal columns are mapped through the column map */
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* keep the communicator alive across the destroy so the IS can be recreated on it */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
468 
469 /*
470   Sample message format:
471   If a processor A wants processor B to process some elements corresponding
472   to index sets is[1],is[5]
473   mesg [0] = 2   (no of index sets in the mesg)
474   -----------
475   mesg [1] = 1 => is[1]
476   mesg [2] = sizeof(is[1]);
477   -----------
478   mesg [3] = 5  => is[5]
479   mesg [4] = sizeof(is[5]);
480   -----------
481   mesg [5]
482   mesg [n]  data(is[1])
483   -----------
484   mesg[n+1]
485   mesg[m]  data(is[5])
486   -----------
487 
488   Notes:
489   nrqs - no of requests sent (or to be sent out)
490   nrqr - no of requests received (which have to be or which have been processed)
491 */
492 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
493 {
494   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
495   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
496   const PetscInt **idx, *idx_i;
497   PetscInt        *n, **data, len;
498 #if defined(PETSC_USE_CTABLE)
499   PetscHMapI *table_data, table_data_i;
500   PetscInt   *tdata, tcount, tcount_max;
501 #else
502   PetscInt *data_i, *d_p;
503 #endif
504   PetscMPIInt  size, rank, tag1, tag2, proc = 0, nrqs, *pa;
505   PetscInt     M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
506   PetscInt    *ctr, *tmp, *isz, **xdata, **rbuf2;
507   PetscBT     *table;
508   MPI_Comm     comm;
509   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
510   MPI_Status  *recv_status;
511   MPI_Comm    *iscomms;
512   char        *t_p;
513 
514   PetscFunctionBegin;
515   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
516   size = c->size;
517   rank = c->rank;
518   M    = C->rmap->N;
519 
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
521   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
522 
523   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
524 
525   for (PetscInt i = 0; i < imax; i++) {
526     PetscCall(ISGetIndices(is[i], &idx[i]));
527     PetscCall(ISGetLocalSize(is[i], &n[i]));
528   }
529 
530   /* evaluate communication - mesg to who,length of mesg, and buffer space
531      required. Based on this, buffers are allocated, and data copied into them  */
532   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
533   for (PetscInt i = 0; i < imax; i++) {
534     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
535     idx_i = idx[i];
536     len   = n[i];
537     for (PetscInt j = 0; j < len; j++) {
538       row = idx_i[j];
539       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
540       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
541       w4[proc]++;
542     }
543     for (PetscMPIInt j = 0; j < size; j++) {
544       if (w4[j]) {
545         w1[j] += w4[j];
546         w3[j]++;
547       }
548     }
549   }
550 
551   nrqs     = 0; /* no of outgoing messages */
552   msz      = 0; /* total mesg length (for all proc */
553   w1[rank] = 0; /* no mesg sent to intself */
554   w3[rank] = 0;
555   for (PetscMPIInt i = 0; i < size; i++) {
556     if (w1[i]) {
557       w2[i] = 1;
558       nrqs++;
559     } /* there exists a message to proc i */
560   }
561   /* pa - is list of processors to communicate with */
562   PetscCall(PetscMalloc1(nrqs, &pa));
563   for (PetscMPIInt i = 0, j = 0; i < size; i++) {
564     if (w1[i]) {
565       pa[j] = i;
566       j++;
567     }
568   }
569 
570   /* Each message would have a header = 1 + 2*(no of IS) + data */
571   for (PetscMPIInt i = 0; i < nrqs; i++) {
572     PetscMPIInt j = pa[i];
573     w1[j] += w2[j] + 2 * w3[j];
574     msz += w1[j];
575   }
576 
577   /* Determine the number of messages to expect, their lengths, from from-ids */
578   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
579   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
580 
581   /* Now post the Irecvs corresponding to these messages */
582   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
583 
584   /* Allocate Memory for outgoing messages */
585   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
586   PetscCall(PetscArrayzero(outdat, size));
587   PetscCall(PetscArrayzero(ptr, size));
588 
589   {
590     PetscInt *iptr = tmp, ict = 0;
591     for (PetscMPIInt i = 0; i < nrqs; i++) {
592       PetscMPIInt j = pa[i];
593       iptr += ict;
594       outdat[j] = iptr;
595       ict       = w1[j];
596     }
597   }
598 
599   /* Form the outgoing messages */
600   /* plug in the headers */
601   for (PetscMPIInt i = 0; i < nrqs; i++) {
602     PetscMPIInt j = pa[i];
603     outdat[j][0]  = 0;
604     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
605     ptr[j] = outdat[j] + 2 * w3[j] + 1;
606   }
607 
608   /* Memory for doing local proc's work */
609   {
610     PetscInt M_BPB_imax = 0;
611 #if defined(PETSC_USE_CTABLE)
612     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
613     PetscCall(PetscMalloc1(imax, &table_data));
614     for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
615     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
616     for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
617 #else
618     PetscInt Mimax = 0;
619     PetscCall(PetscIntMultError(M, imax, &Mimax));
620     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
621     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
622     for (PetscInt i = 0; i < imax; i++) {
623       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
624       data[i]  = d_p + M * i;
625     }
626 #endif
627   }
628 
629   /* Parse the IS and update local tables and the outgoing buf with the data */
630   {
631     PetscInt n_i, isz_i, *outdat_j, ctr_j;
632     PetscBT  table_i;
633 
634     for (PetscInt i = 0; i < imax; i++) {
635       PetscCall(PetscArrayzero(ctr, size));
636       n_i     = n[i];
637       table_i = table[i];
638       idx_i   = idx[i];
639 #if defined(PETSC_USE_CTABLE)
640       table_data_i = table_data[i];
641 #else
642       data_i = data[i];
643 #endif
644       isz_i = isz[i];
645       for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
646         row = idx_i[j];
647         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
648         if (proc != rank) { /* copy to the outgoing buffer */
649           ctr[proc]++;
650           *ptr[proc] = row;
651           ptr[proc]++;
652         } else if (!PetscBTLookupSet(table_i, row)) {
653 #if defined(PETSC_USE_CTABLE)
654           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
655 #else
656           data_i[isz_i] = row; /* Update the local table */
657 #endif
658           isz_i++;
659         }
660       }
661       /* Update the headers for the current IS */
662       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
663         if ((ctr_j = ctr[j])) {
664           outdat_j            = outdat[j];
665           k                   = ++outdat_j[0];
666           outdat_j[2 * k]     = ctr_j;
667           outdat_j[2 * k - 1] = i;
668         }
669       }
670       isz[i] = isz_i;
671     }
672   }
673 
674   /*  Now  post the sends */
675   PetscCall(PetscMalloc1(nrqs, &s_waits1));
676   for (PetscMPIInt i = 0; i < nrqs; ++i) {
677     PetscMPIInt j = pa[i];
678     PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
679   }
680 
681   /* No longer need the original indices */
682   PetscCall(PetscMalloc1(imax, &iscomms));
683   for (PetscInt i = 0; i < imax; ++i) {
684     PetscCall(ISRestoreIndices(is[i], idx + i));
685     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
686   }
687   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
688 
689   for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
690 
691     /* Do Local work */
692 #if defined(PETSC_USE_CTABLE)
693   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
694 #else
695   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
696 #endif
697 
698   /* Receive messages */
699   PetscCall(PetscMalloc1(nrqr, &recv_status));
700   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
701   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
702 
703   /* Phase 1 sends are complete - deallocate buffers */
704   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
705   PetscCall(PetscFree4(w1, w2, w3, w4));
706 
707   PetscCall(PetscMalloc1(nrqr, &xdata));
708   PetscCall(PetscMalloc1(nrqr, &isz1));
709   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
710   PetscCall(PetscFree(rbuf[0]));
711   PetscCall(PetscFree(rbuf));
712 
713   /* Send the data back */
714   /* Do a global reduction to know the buffer space req for incoming messages */
715   {
716     PetscMPIInt *rw1;
717 
718     PetscCall(PetscCalloc1(size, &rw1));
719     for (PetscMPIInt i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
722       PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
723     }
724     PetscCall(PetscFree(onodes1));
725     PetscCall(PetscFree(olengths1));
726 
727     /* Determine the number of messages to expect, their lengths, from from-ids */
728     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
729     PetscCall(PetscFree(rw1));
730   }
731   /* Now post the Irecvs corresponding to these messages */
732   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
733 
734   /* Now  post the sends */
735   PetscCall(PetscMalloc1(nrqr, &s_waits2));
736   for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); }
737 
738   /* receive work done on other processors */
739   {
740     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
741     PetscMPIInt idex;
742     PetscBT     table_i;
743 
744     for (PetscMPIInt i = 0; i < nrqs; ++i) {
745       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
746       /* Process the message */
747       rbuf2_i = rbuf2[idex];
748       ct1     = 2 * rbuf2_i[0] + 1;
749       jmax    = rbuf2[idex][0];
750       for (PetscInt j = 1; j <= jmax; j++) {
751         max     = rbuf2_i[2 * j];
752         is_no   = rbuf2_i[2 * j - 1];
753         isz_i   = isz[is_no];
754         table_i = table[is_no];
755 #if defined(PETSC_USE_CTABLE)
756         table_data_i = table_data[is_no];
757 #else
758         data_i = data[is_no];
759 #endif
760         for (k = 0; k < max; k++, ct1++) {
761           row = rbuf2_i[ct1];
762           if (!PetscBTLookupSet(table_i, row)) {
763 #if defined(PETSC_USE_CTABLE)
764             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
765 #else
766             data_i[isz_i] = row;
767 #endif
768             isz_i++;
769           }
770         }
771         isz[is_no] = isz_i;
772       }
773     }
774 
775     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
776   }
777 
778 #if defined(PETSC_USE_CTABLE)
779   tcount_max = 0;
780   for (PetscInt i = 0; i < imax; ++i) {
781     table_data_i = table_data[i];
782     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
783     if (tcount_max < tcount) tcount_max = tcount;
784   }
785   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
786 #endif
787 
788   for (PetscInt i = 0; i < imax; ++i) {
789 #if defined(PETSC_USE_CTABLE)
790     PetscHashIter tpos;
791     PetscInt      j;
792 
793     table_data_i = table_data[i];
794     PetscHashIterBegin(table_data_i, tpos);
795     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
796       PetscHashIterGetKey(table_data_i, tpos, k);
797       PetscHashIterGetVal(table_data_i, tpos, j);
798       PetscHashIterNext(table_data_i, tpos);
799       tdata[--j] = --k;
800     }
801     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
802 #else
803     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
804 #endif
805     PetscCall(PetscCommDestroy(&iscomms[i]));
806   }
807 
808   PetscCall(PetscFree(iscomms));
809   PetscCall(PetscFree(onodes2));
810   PetscCall(PetscFree(olengths2));
811 
812   PetscCall(PetscFree(pa));
813   PetscCall(PetscFree(rbuf2[0]));
814   PetscCall(PetscFree(rbuf2));
815   PetscCall(PetscFree(s_waits1));
816   PetscCall(PetscFree(r_waits1));
817   PetscCall(PetscFree(s_waits2));
818   PetscCall(PetscFree(r_waits2));
819   PetscCall(PetscFree(recv_status));
820   if (xdata) {
821     PetscCall(PetscFree(xdata[0]));
822     PetscCall(PetscFree(xdata));
823   }
824   PetscCall(PetscFree(isz1));
825 #if defined(PETSC_USE_CTABLE)
826   for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
827   PetscCall(PetscFree(table_data));
828   PetscCall(PetscFree(tdata));
829   PetscCall(PetscFree4(table, data, isz, t_p));
830 #else
831   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
832 #endif
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
836 /*
837    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
838        the work on the local processor.
839 
840      Inputs:
841       C      - MAT_MPIAIJ;
842       imax - total no of index sets processed at a time;
843       table  - an array of char - size = m bits.
844 
845      Output:
846       isz    - array containing the count of the solution elements corresponding
847                to each index set;
848       data or table_data  - pointer to the solutions
849 */
850 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
851 {
852   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
853   Mat         A = c->A, B = c->B;
854   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
855   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
856   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
857   PetscBT     table_i;
858 #if defined(PETSC_USE_CTABLE)
859   PetscHMapI    table_data_i;
860   PetscHashIter tpos;
861   PetscInt      tcount, *tdata;
862 #else
863   PetscInt *data_i;
864 #endif
865 
866   PetscFunctionBegin;
867   rstart = C->rmap->rstart;
868   cstart = C->cmap->rstart;
869   ai     = a->i;
870   aj     = a->j;
871   bi     = b->i;
872   bj     = b->j;
873   garray = c->garray;
874 
875   for (i = 0; i < imax; i++) {
876 #if defined(PETSC_USE_CTABLE)
877     /* copy existing entries of table_data_i into tdata[] */
878     table_data_i = table_data[i];
879     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
880     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
881 
882     PetscCall(PetscMalloc1(tcount, &tdata));
883     PetscHashIterBegin(table_data_i, tpos);
884     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
885       PetscHashIterGetKey(table_data_i, tpos, row);
886       PetscHashIterGetVal(table_data_i, tpos, j);
887       PetscHashIterNext(table_data_i, tpos);
888       tdata[--j] = --row;
889       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
890     }
891 #else
892     data_i = data[i];
893 #endif
894     table_i = table[i];
895     isz_i   = isz[i];
896     max     = isz[i];
897 
898     for (j = 0; j < max; j++) {
899 #if defined(PETSC_USE_CTABLE)
900       row = tdata[j] - rstart;
901 #else
902       row = data_i[j] - rstart;
903 #endif
904       start = ai[row];
905       end   = ai[row + 1];
906       for (k = start; k < end; k++) { /* Amat */
907         val = aj[k] + cstart;
908         if (!PetscBTLookupSet(table_i, val)) {
909 #if defined(PETSC_USE_CTABLE)
910           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
911 #else
912           data_i[isz_i] = val;
913 #endif
914           isz_i++;
915         }
916       }
917       start = bi[row];
918       end   = bi[row + 1];
919       for (k = start; k < end; k++) { /* Bmat */
920         val = garray[bj[k]];
921         if (!PetscBTLookupSet(table_i, val)) {
922 #if defined(PETSC_USE_CTABLE)
923           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
924 #else
925           data_i[isz_i] = val;
926 #endif
927           isz_i++;
928         }
929       }
930     }
931     isz[i] = isz_i;
932 
933 #if defined(PETSC_USE_CTABLE)
934     PetscCall(PetscFree(tdata));
935 #endif
936   }
937   PetscFunctionReturn(PETSC_SUCCESS);
938 }
939 
940 /*
941       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
942          and return the output
943 
944          Input:
945            C    - the matrix
946            nrqr - no of messages being processed.
947            rbuf - an array of pointers to the received requests
948 
949          Output:
950            xdata - array of messages to be sent back
951            isz1  - size of each message
952 
953   For better efficiency perhaps we should malloc separately each xdata[i],
954 then if a remalloc is required we need only copy the data for that one row
955 rather than all previous rows as it is now where a single large chunk of
956 memory is used.
957 
958 */
959 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
960 {
961   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
962   Mat         A = c->A, B = c->B;
963   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
964   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
965   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
966   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
967   PetscInt   *rbuf_i, kmax, rbuf_0;
968   PetscBT     xtable;
969 
970   PetscFunctionBegin;
971   m      = C->rmap->N;
972   rstart = C->rmap->rstart;
973   cstart = C->cmap->rstart;
974   ai     = a->i;
975   aj     = a->j;
976   bi     = b->i;
977   bj     = b->j;
978   garray = c->garray;
979 
980   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
981     rbuf_i = rbuf[i];
982     rbuf_0 = rbuf_i[0];
983     ct += rbuf_0;
984     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
985   }
986 
987   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
988   else max1 = 1;
989   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
990   if (nrqr) {
991     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
992     ++no_malloc;
993   }
994   PetscCall(PetscBTCreate(m, &xtable));
995   PetscCall(PetscArrayzero(isz1, nrqr));
996 
997   ct3 = 0;
998   for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */
999     rbuf_i = rbuf[i];
1000     rbuf_0 = rbuf_i[0];
1001     ct1    = 2 * rbuf_0 + 1;
1002     ct2    = ct1;
1003     ct3 += ct1;
1004     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1005       PetscCall(PetscBTMemzero(m, xtable));
1006       oct2 = ct2;
1007       kmax = rbuf_i[2 * j];
1008       for (k = 0; k < kmax; k++, ct1++) {
1009         row = rbuf_i[ct1];
1010         if (!PetscBTLookupSet(xtable, row)) {
1011           if (!(ct3 < mem_estimate)) {
1012             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1013             PetscCall(PetscMalloc1(new_estimate, &tmp));
1014             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1015             PetscCall(PetscFree(xdata[0]));
1016             xdata[0]     = tmp;
1017             mem_estimate = new_estimate;
1018             ++no_malloc;
1019             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1020           }
1021           xdata[i][ct2++] = row;
1022           ct3++;
1023         }
1024       }
1025       for (k = oct2, max2 = ct2; k < max2; k++) {
1026         row   = xdata[i][k] - rstart;
1027         start = ai[row];
1028         end   = ai[row + 1];
1029         for (l = start; l < end; l++) {
1030           val = aj[l] + cstart;
1031           if (!PetscBTLookupSet(xtable, val)) {
1032             if (!(ct3 < mem_estimate)) {
1033               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1034               PetscCall(PetscMalloc1(new_estimate, &tmp));
1035               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1036               PetscCall(PetscFree(xdata[0]));
1037               xdata[0]     = tmp;
1038               mem_estimate = new_estimate;
1039               ++no_malloc;
1040               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1041             }
1042             xdata[i][ct2++] = val;
1043             ct3++;
1044           }
1045         }
1046         start = bi[row];
1047         end   = bi[row + 1];
1048         for (l = start; l < end; l++) {
1049           val = garray[bj[l]];
1050           if (!PetscBTLookupSet(xtable, val)) {
1051             if (!(ct3 < mem_estimate)) {
1052               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1053               PetscCall(PetscMalloc1(new_estimate, &tmp));
1054               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1055               PetscCall(PetscFree(xdata[0]));
1056               xdata[0]     = tmp;
1057               mem_estimate = new_estimate;
1058               ++no_malloc;
1059               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1060             }
1061             xdata[i][ct2++] = val;
1062             ct3++;
1063           }
1064         }
1065       }
1066       /* Update the header*/
1067       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1068       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1069     }
1070     xdata[i][0] = rbuf_0;
1071     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1072     isz1[i] = ct2; /* size of each message */
1073   }
1074   PetscCall(PetscBTDestroy(&xtable));
1075   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1080 /*
1081     Every processor gets the entire matrix
1082 */
1083 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1084 {
1085   Mat         B;
1086   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1087   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1088   PetscMPIInt size, rank;
1089   PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1090   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1091   PetscMPIInt iN, inrecv, inz;
1092 
1093   PetscFunctionBegin;
1094   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1095   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1096   if (scall == MAT_INITIAL_MATRIX) {
1097     /* Tell every processor the number of nonzeros per row */
1098     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1099     for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1100 
1101     /* All MPI processes get the same matrix */
1102     PetscCall(PetscMPIIntCast(A->rmap->N, &iN));
1103     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, iN, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1104 
1105     /*     Create the sequential matrix of the same type as the local block diagonal  */
1106     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1107     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1108     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1109     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1110     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1111     PetscCall(PetscCalloc1(2, Bin));
1112     **Bin = B;
1113     b     = (Mat_SeqAIJ *)B->data;
1114 
1115     /* zero column space */
1116     nrecv = 0;
1117     for (PetscMPIInt i = 0; i < size; i++) {
1118       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1119     }
1120     PetscCall(PetscArrayzero(b->j, nrecv));
1121 
1122     /*   Copy my part of matrix column indices over    */
1123     sendcount  = ad->nz + bd->nz;
1124     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1125     a_jsendbuf = ad->j;
1126     b_jsendbuf = bd->j;
1127     n          = A->rmap->rend - A->rmap->rstart;
1128     cnt        = 0;
1129     for (PetscInt i = 0; i < n; i++) {
1130       /* put in lower diagonal portion */
1131       m = bd->i[i + 1] - bd->i[i];
1132       while (m > 0) {
1133         /* is it above diagonal (in bd (compressed) numbering) */
1134         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1135         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1136         m--;
1137       }
1138 
1139       /* put in diagonal portion */
1140       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1141 
1142       /* put in upper diagonal portion */
1143       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1144     }
1145     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1146 
1147     /* send column indices, b->j was zeroed */
1148     PetscCall(PetscMPIIntCast(nrecv, &inrecv));
1149     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, inrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1150 
1151     /*  Assemble the matrix into useable form (numerical values not yet set) */
1152     /* set the b->ilen (length of each row) values */
1153     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154     /* set the b->i indices */
1155     b->i[0] = 0;
1156     for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157     PetscCall(PetscFree(lens));
1158     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1160 
1161   } else {
1162     B = **Bin;
1163     b = (Mat_SeqAIJ *)B->data;
1164   }
1165 
1166   /* Copy my part of matrix numerical values into the values location */
1167   if (flag == MAT_GET_VALUES) {
1168     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169     MatScalar         *sendbuf;
1170 
1171     /* initialize b->a */
1172     PetscCall(PetscArrayzero(b->a, b->nz));
1173 
1174     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1175     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1176     sendcount = ad->nz + bd->nz;
1177     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1178     a_sendbuf = ada;
1179     b_sendbuf = bda;
1180     b_sendj   = bd->j;
1181     n         = A->rmap->rend - A->rmap->rstart;
1182     cnt       = 0;
1183     for (PetscInt i = 0; i < n; i++) {
1184       /* put in lower diagonal portion */
1185       m = bd->i[i + 1] - bd->i[i];
1186       while (m > 0) {
1187         /* is it above diagonal (in bd (compressed) numbering) */
1188         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1189         sendbuf[cnt++] = *b_sendbuf++;
1190         m--;
1191         b_sendj++;
1192       }
1193 
1194       /* put in diagonal portion */
1195       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1196 
1197       /* put in upper diagonal portion */
1198       while (m-- > 0) {
1199         sendbuf[cnt++] = *b_sendbuf++;
1200         b_sendj++;
1201       }
1202     }
1203     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1204 
1205     /* send values, b->a was zeroed */
1206     PetscCall(PetscMPIIntCast(b->nz, &inz));
1207     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, inz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1208 
1209     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1210     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1211   } /* endof (flag == MAT_GET_VALUES) */
1212 
1213   PetscCall(MatPropagateSymmetryOptions(A, B));
1214   PetscFunctionReturn(PETSC_SUCCESS);
1215 }
1216 
1217 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1218 {
1219   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1220   Mat             submat, A = c->A, B = c->B;
1221   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1222   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1223   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1224   const PetscInt *icol, *irow;
1225   PetscInt        nrow, ncol, start;
1226   PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
1227   PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
1228   PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
1229   PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
1230   PetscInt       *lens, rmax, ncols, *cols, Crow;
1231 #if defined(PETSC_USE_CTABLE)
1232   PetscHMapI cmap, rmap;
1233   PetscInt  *cmap_loc, *rmap_loc;
1234 #else
1235   PetscInt *cmap, *rmap;
1236 #endif
1237   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
1238   PetscInt     *cworkB, lwrite, *subcols, ib, jb;
1239   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1240   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1241   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1242   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1243   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1244   MPI_Comm      comm;
1245   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1246   PetscMPIInt  *onodes1, *olengths1, idex, end, *row2proc;
1247   Mat_SubSppt  *smatis1;
1248   PetscBool     isrowsorted, iscolsorted;
1249 
1250   PetscFunctionBegin;
1251   PetscValidLogicalCollectiveInt(C, ismax, 2);
1252   PetscValidLogicalCollectiveEnum(C, scall, 5);
1253   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1254   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1255   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1256   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1257   size = c->size;
1258   rank = c->rank;
1259 
1260   PetscCall(ISSorted(iscol[0], &iscolsorted));
1261   PetscCall(ISSorted(isrow[0], &isrowsorted));
1262   PetscCall(ISGetIndices(isrow[0], &irow));
1263   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1264   if (allcolumns) {
1265     icol = NULL;
1266     ncol = C->cmap->N;
1267   } else {
1268     PetscCall(ISGetIndices(iscol[0], &icol));
1269     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1270   }
1271 
1272   if (scall == MAT_INITIAL_MATRIX) {
1273     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1274 
1275     /* Get some new tags to keep the communication clean */
1276     tag1 = ((PetscObject)C)->tag;
1277     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1278     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1279 
1280     /* evaluate communication - mesg to who, length of mesg, and buffer space
1281      required. Based on this, buffers are allocated, and data copied into them */
1282     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1283     PetscCall(PetscMalloc1(nrow, &row2proc));
1284 
1285     /* w1[proc] = num of rows owned by proc -- to be requested */
1286     proc = 0;
1287     nrqs = 0; /* num of outgoing messages */
1288     for (PetscInt j = 0; j < nrow; j++) {
1289       row = irow[j];
1290       if (!isrowsorted) proc = 0;
1291       while (row >= C->rmap->range[proc + 1]) proc++;
1292       w1[proc]++;
1293       row2proc[j] = proc; /* map row index to proc */
1294 
1295       if (proc != rank && !w2[proc]) {
1296         w2[proc] = 1;
1297         nrqs++;
1298       }
1299     }
1300     w1[rank] = 0; /* rows owned by self will not be requested */
1301 
1302     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1303     for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) {
1304       if (w1[proc]) pa[j++] = proc;
1305     }
1306 
1307     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1308     msz = 0; /* total mesg length (for all procs) */
1309     for (PetscMPIInt i = 0; i < nrqs; i++) {
1310       proc = pa[i];
1311       w1[proc] += 3;
1312       msz += w1[proc];
1313     }
1314     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
1315 
1316     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1317     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1318     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1319 
1320     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1321        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1322     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1323 
1324     /* Now post the Irecvs corresponding to these messages */
1325     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1326 
1327     PetscCall(PetscFree(onodes1));
1328     PetscCall(PetscFree(olengths1));
1329 
1330     /* Allocate Memory for outgoing messages */
1331     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1332     PetscCall(PetscArrayzero(sbuf1, size));
1333     PetscCall(PetscArrayzero(ptr, size));
1334 
1335     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1336     iptr = tmp;
1337     for (PetscMPIInt i = 0; i < nrqs; i++) {
1338       proc        = pa[i];
1339       sbuf1[proc] = iptr;
1340       iptr += w1[proc];
1341     }
1342 
1343     /* Form the outgoing messages */
1344     /* Initialize the header space */
1345     for (PetscMPIInt i = 0; i < nrqs; i++) {
1346       proc = pa[i];
1347       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1348       ptr[proc] = sbuf1[proc] + 3;
1349     }
1350 
1351     /* Parse the isrow and copy data into outbuf */
1352     PetscCall(PetscArrayzero(ctr, size));
1353     for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */
1354       proc = row2proc[j];
1355       if (proc != rank) { /* copy to the outgoing buf*/
1356         *ptr[proc] = irow[j];
1357         ctr[proc]++;
1358         ptr[proc]++;
1359       }
1360     }
1361 
1362     /* Update the headers for the current IS */
1363     for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
1364       if ((ctr_j = ctr[j])) {
1365         sbuf1_j            = sbuf1[j];
1366         k                  = ++sbuf1_j[0];
1367         sbuf1_j[2 * k]     = ctr_j;
1368         sbuf1_j[2 * k - 1] = 0;
1369       }
1370     }
1371 
1372     /* Now post the sends */
1373     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1374     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1375       proc = pa[i];
1376       PetscCallMPI(MPIU_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1377     }
1378 
1379     /* Post Receives to capture the buffer size */
1380     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1381     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1382 
1383     if (nrqs) rbuf2[0] = tmp + msz;
1384     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1385 
1386     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1387       proc = pa[i];
1388       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1389     }
1390 
1391     PetscCall(PetscFree2(w1, w2));
1392 
1393     /* Send to other procs the buf size they should allocate */
1394     /* Receive messages*/
1395     PetscCall(PetscMalloc1(nrqr, &r_status1));
1396     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1397 
1398     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1399     for (PetscMPIInt i = 0; i < nrqr; ++i) {
1400       req_size[i] = 0;
1401       rbuf1_i     = rbuf1[i];
1402       start       = 2 * rbuf1_i[0] + 1;
1403       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1404       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1405       sbuf2_i = sbuf2[i];
1406       for (PetscInt j = start; j < end; j++) {
1407         k          = rbuf1_i[j] - rstart;
1408         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1409         sbuf2_i[j] = ncols;
1410         req_size[i] += ncols;
1411       }
1412       req_source1[i] = r_status1[i].MPI_SOURCE;
1413 
1414       /* form the header */
1415       sbuf2_i[0] = req_size[i];
1416       for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1417 
1418       PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1419     }
1420 
1421     PetscCall(PetscFree(r_status1));
1422     PetscCall(PetscFree(r_waits1));
1423 
1424     /* rbuf2 is received, Post recv column indices a->j */
1425     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1426 
1427     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1428     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1429       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1430       req_source2[i] = r_status2[i].MPI_SOURCE;
1431       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1432     }
1433 
1434     /* Wait on sends1 and sends2 */
1435     PetscCall(PetscMalloc1(nrqs, &s_status1));
1436     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1437     PetscCall(PetscFree(s_waits1));
1438     PetscCall(PetscFree(s_status1));
1439 
1440     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1441     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1442 
1443     /* Now allocate sending buffers for a->j, and send them off */
1444     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1445     jcnt = 0;
1446     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1447     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
1448     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1449 
1450     for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */
1451       rbuf1_i   = rbuf1[i];
1452       sbuf_aj_i = sbuf_aj[i];
1453       ct1       = 2 * rbuf1_i[0] + 1;
1454       ct2       = 0;
1455       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1456 
1457       kmax = rbuf1[i][2];
1458       for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */
1459         row    = rbuf1_i[ct1] - rstart;
1460         nzA    = ai[row + 1] - ai[row];
1461         nzB    = bi[row + 1] - bi[row];
1462         ncols  = nzA + nzB;
1463         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1464         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1465 
1466         /* load the column indices for this row into cols*/
1467         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1468 
1469         lwrite = 0;
1470         for (PetscInt l = 0; l < nzB; l++) {
1471           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472         }
1473         for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474         for (PetscInt l = 0; l < nzB; l++) {
1475           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476         }
1477 
1478         ct2 += ncols;
1479       }
1480       PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1481     }
1482 
1483     /* create column map (cmap): global col of C -> local col of submat */
1484 #if defined(PETSC_USE_CTABLE)
1485     if (!allcolumns) {
1486       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1487       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1488       for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1489         if (icol[j] >= cstart && icol[j] < cend) {
1490           cmap_loc[icol[j] - cstart] = j + 1;
1491         } else { /* use PetscHMapI for non-local col indices */
1492           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1493         }
1494       }
1495     } else {
1496       cmap     = NULL;
1497       cmap_loc = NULL;
1498     }
1499     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1500 #else
1501     if (!allcolumns) {
1502       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1503       for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1504     } else {
1505       cmap = NULL;
1506     }
1507 #endif
1508 
1509     /* Create lens for MatSeqAIJSetPreallocation() */
1510     PetscCall(PetscCalloc1(nrow, &lens));
1511 
1512     /* Compute lens from local part of C */
1513     for (PetscInt j = 0; j < nrow; j++) {
1514       row  = irow[j];
1515       proc = row2proc[j];
1516       if (proc == rank) {
1517         /* diagonal part A = c->A */
1518         ncols = ai[row - rstart + 1] - ai[row - rstart];
1519         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1520         if (!allcolumns) {
1521           for (PetscInt k = 0; k < ncols; k++) {
1522 #if defined(PETSC_USE_CTABLE)
1523             tcol = cmap_loc[cols[k]];
1524 #else
1525             tcol = cmap[cols[k] + cstart];
1526 #endif
1527             if (tcol) lens[j]++;
1528           }
1529         } else { /* allcolumns */
1530           lens[j] = ncols;
1531         }
1532 
1533         /* off-diagonal part B = c->B */
1534         ncols = bi[row - rstart + 1] - bi[row - rstart];
1535         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1536         if (!allcolumns) {
1537           for (PetscInt k = 0; k < ncols; k++) {
1538 #if defined(PETSC_USE_CTABLE)
1539             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1540 #else
1541             tcol = cmap[bmap[cols[k]]];
1542 #endif
1543             if (tcol) lens[j]++;
1544           }
1545         } else { /* allcolumns */
1546           lens[j] += ncols;
1547         }
1548       }
1549     }
1550 
1551     /* Create row map (rmap): global row of C -> local row of submat */
1552 #if defined(PETSC_USE_CTABLE)
1553     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1554     for (PetscInt j = 0; j < nrow; j++) {
1555       row  = irow[j];
1556       proc = row2proc[j];
1557       if (proc == rank) { /* a local row */
1558         rmap_loc[row - rstart] = j;
1559       } else {
1560         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1561       }
1562     }
1563 #else
1564     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1565     for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j;
1566 #endif
1567 
1568     /* Update lens from offproc data */
1569     /* recv a->j is done */
1570     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1571     for (PetscMPIInt i = 0; i < nrqs; i++) {
1572       proc    = pa[i];
1573       sbuf1_i = sbuf1[proc];
1574       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1575       ct1     = 2 + 1;
1576       ct2     = 0;
1577       rbuf2_i = rbuf2[i]; /* received length of C->j */
1578       rbuf3_i = rbuf3[i]; /* received C->j */
1579 
1580       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1581       max1 = sbuf1_i[2];
1582       for (PetscInt k = 0; k < max1; k++, ct1++) {
1583 #if defined(PETSC_USE_CTABLE)
1584         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1585         row--;
1586         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1587 #else
1588         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1589 #endif
1590         /* Now, store row index of submat in sbuf1_i[ct1] */
1591         sbuf1_i[ct1] = row;
1592 
1593         nnz = rbuf2_i[ct1];
1594         if (!allcolumns) {
1595           for (PetscMPIInt l = 0; l < nnz; l++, ct2++) {
1596 #if defined(PETSC_USE_CTABLE)
1597             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1598               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1599             } else {
1600               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1601             }
1602 #else
1603             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1604 #endif
1605             if (tcol) lens[row]++;
1606           }
1607         } else { /* allcolumns */
1608           lens[row] += nnz;
1609         }
1610       }
1611     }
1612     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1613     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1614 
1615     /* Create the submatrices */
1616     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1617     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1618 
1619     PetscCall(ISGetBlockSize(isrow[0], &ib));
1620     PetscCall(ISGetBlockSize(iscol[0], &jb));
1621     if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb));
1622     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1623     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1624 
1625     /* create struct Mat_SubSppt and attached it to submat */
1626     PetscCall(PetscNew(&smatis1));
1627     subc            = (Mat_SeqAIJ *)submat->data;
1628     subc->submatis1 = smatis1;
1629 
1630     smatis1->id          = 0;
1631     smatis1->nrqs        = nrqs;
1632     smatis1->nrqr        = nrqr;
1633     smatis1->rbuf1       = rbuf1;
1634     smatis1->rbuf2       = rbuf2;
1635     smatis1->rbuf3       = rbuf3;
1636     smatis1->sbuf2       = sbuf2;
1637     smatis1->req_source2 = req_source2;
1638 
1639     smatis1->sbuf1 = sbuf1;
1640     smatis1->ptr   = ptr;
1641     smatis1->tmp   = tmp;
1642     smatis1->ctr   = ctr;
1643 
1644     smatis1->pa          = pa;
1645     smatis1->req_size    = req_size;
1646     smatis1->req_source1 = req_source1;
1647 
1648     smatis1->allcolumns = allcolumns;
1649     smatis1->singleis   = PETSC_TRUE;
1650     smatis1->row2proc   = row2proc;
1651     smatis1->rmap       = rmap;
1652     smatis1->cmap       = cmap;
1653 #if defined(PETSC_USE_CTABLE)
1654     smatis1->rmap_loc = rmap_loc;
1655     smatis1->cmap_loc = cmap_loc;
1656 #endif
1657 
1658     smatis1->destroy     = submat->ops->destroy;
1659     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1660     submat->factortype   = C->factortype;
1661 
1662     /* compute rmax */
1663     rmax = 0;
1664     for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1665 
1666   } else { /* scall == MAT_REUSE_MATRIX */
1667     submat = submats[0];
1668     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1669 
1670     subc        = (Mat_SeqAIJ *)submat->data;
1671     rmax        = subc->rmax;
1672     smatis1     = subc->submatis1;
1673     nrqs        = smatis1->nrqs;
1674     nrqr        = smatis1->nrqr;
1675     rbuf1       = smatis1->rbuf1;
1676     rbuf2       = smatis1->rbuf2;
1677     rbuf3       = smatis1->rbuf3;
1678     req_source2 = smatis1->req_source2;
1679 
1680     sbuf1 = smatis1->sbuf1;
1681     sbuf2 = smatis1->sbuf2;
1682     ptr   = smatis1->ptr;
1683     tmp   = smatis1->tmp;
1684     ctr   = smatis1->ctr;
1685 
1686     pa          = smatis1->pa;
1687     req_size    = smatis1->req_size;
1688     req_source1 = smatis1->req_source1;
1689 
1690     allcolumns = smatis1->allcolumns;
1691     row2proc   = smatis1->row2proc;
1692     rmap       = smatis1->rmap;
1693     cmap       = smatis1->cmap;
1694 #if defined(PETSC_USE_CTABLE)
1695     rmap_loc = smatis1->rmap_loc;
1696     cmap_loc = smatis1->cmap_loc;
1697 #endif
1698   }
1699 
1700   /* Post recv matrix values */
1701   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1702   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1703   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1704   for (PetscMPIInt i = 0; i < nrqs; ++i) {
1705     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1706     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1707   }
1708 
1709   /* Allocate sending buffers for a->a, and send them off */
1710   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1711   jcnt = 0;
1712   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1713   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
1714   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1715 
1716   for (PetscMPIInt i = 0; i < nrqr; i++) {
1717     rbuf1_i   = rbuf1[i];
1718     sbuf_aa_i = sbuf_aa[i];
1719     ct1       = 2 * rbuf1_i[0] + 1;
1720     ct2       = 0;
1721     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1722 
1723     kmax = rbuf1_i[2];
1724     for (PetscInt k = 0; k < kmax; k++, ct1++) {
1725       row    = rbuf1_i[ct1] - rstart;
1726       nzA    = ai[row + 1] - ai[row];
1727       nzB    = bi[row + 1] - bi[row];
1728       ncols  = nzA + nzB;
1729       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1730       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1731       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1732 
1733       /* load the column values for this row into vals*/
1734       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1735 
1736       lwrite = 0;
1737       for (PetscInt l = 0; l < nzB; l++) {
1738         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1739       }
1740       for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1741       for (PetscInt l = 0; l < nzB; l++) {
1742         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1743       }
1744 
1745       ct2 += ncols;
1746     }
1747     PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1748   }
1749 
1750   /* Assemble submat */
1751   /* First assemble the local rows */
1752   for (PetscInt j = 0; j < nrow; j++) {
1753     row  = irow[j];
1754     proc = row2proc[j];
1755     if (proc == rank) {
1756       Crow = row - rstart; /* local row index of C */
1757 #if defined(PETSC_USE_CTABLE)
1758       row = rmap_loc[Crow]; /* row index of submat */
1759 #else
1760       row = rmap[row];
1761 #endif
1762 
1763       if (allcolumns) {
1764         PetscInt ncol = 0;
1765 
1766         /* diagonal part A = c->A */
1767         ncols = ai[Crow + 1] - ai[Crow];
1768         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1769         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1770         for (PetscInt k = 0; k < ncols; k++) {
1771           subcols[ncol]   = cols[k] + cstart;
1772           subvals[ncol++] = vals[k];
1773         }
1774 
1775         /* off-diagonal part B = c->B */
1776         ncols = bi[Crow + 1] - bi[Crow];
1777         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1778         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1779         for (PetscInt k = 0; k < ncols; k++) {
1780           subcols[ncol]   = bmap[cols[k]];
1781           subvals[ncol++] = vals[k];
1782         }
1783 
1784         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1785 
1786       } else { /* !allcolumns */
1787         PetscInt ncol = 0;
1788 
1789 #if defined(PETSC_USE_CTABLE)
1790         /* diagonal part A = c->A */
1791         ncols = ai[Crow + 1] - ai[Crow];
1792         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1793         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1794         for (PetscInt k = 0; k < ncols; k++) {
1795           tcol = cmap_loc[cols[k]];
1796           if (tcol) {
1797             subcols[ncol]   = --tcol;
1798             subvals[ncol++] = vals[k];
1799           }
1800         }
1801 
1802         /* off-diagonal part B = c->B */
1803         ncols = bi[Crow + 1] - bi[Crow];
1804         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1805         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1806         for (PetscInt k = 0; k < ncols; k++) {
1807           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1808           if (tcol) {
1809             subcols[ncol]   = --tcol;
1810             subvals[ncol++] = vals[k];
1811           }
1812         }
1813 #else
1814         /* diagonal part A = c->A */
1815         ncols = ai[Crow + 1] - ai[Crow];
1816         cols  = aj + ai[Crow];
1817         vals  = a_a + ai[Crow];
1818         for (PetscInt k = 0; k < ncols; k++) {
1819           tcol = cmap[cols[k] + cstart];
1820           if (tcol) {
1821             subcols[ncol]   = --tcol;
1822             subvals[ncol++] = vals[k];
1823           }
1824         }
1825 
1826         /* off-diagonal part B = c->B */
1827         ncols = bi[Crow + 1] - bi[Crow];
1828         cols  = bj + bi[Crow];
1829         vals  = b_a + bi[Crow];
1830         for (PetscInt k = 0; k < ncols; k++) {
1831           tcol = cmap[bmap[cols[k]]];
1832           if (tcol) {
1833             subcols[ncol]   = --tcol;
1834             subvals[ncol++] = vals[k];
1835           }
1836         }
1837 #endif
1838         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1839       }
1840     }
1841   }
1842 
1843   /* Now assemble the off-proc rows */
1844   for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */
1845     /* recv values from other processes */
1846     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1847     proc    = pa[idex];
1848     sbuf1_i = sbuf1[proc];
1849     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1850     ct1     = 2 + 1;
1851     ct2     = 0;           /* count of received C->j */
1852     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1853     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1854     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1855     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1856 
1857     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1858     max1 = sbuf1_i[2];                           /* num of rows */
1859     for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */
1860       row = sbuf1_i[ct1];                        /* row index of submat */
1861       if (!allcolumns) {
1862         idex = 0;
1863         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1864           nnz = rbuf2_i[ct1];                         /* num of C entries in this row */
1865           for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1866 #if defined(PETSC_USE_CTABLE)
1867             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1868               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1869             } else {
1870               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1871             }
1872 #else
1873             tcol = cmap[rbuf3_i[ct2]];
1874 #endif
1875             if (tcol) {
1876               subcols[idex]   = --tcol; /* may not be sorted */
1877               subvals[idex++] = rbuf4_i[ct2];
1878 
1879               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1880                For reuse, we replace received C->j with index that should be inserted to submat */
1881               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1882             }
1883           }
1884           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1885         } else { /* scall == MAT_REUSE_MATRIX */
1886           submat = submats[0];
1887           subc   = (Mat_SeqAIJ *)submat->data;
1888 
1889           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1890           for (PetscInt l = 0; l < nnz; l++) {
1891             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1892             subvals[idex++] = rbuf4_i[ct2];
1893           }
1894 
1895           bj = subc->j + subc->i[row]; /* sorted column indices */
1896           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1897         }
1898       } else {              /* allcolumns */
1899         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1900         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1901         ct2 += nnz;
1902       }
1903     }
1904   }
1905 
1906   /* sending a->a are done */
1907   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1908   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1909 
1910   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1911   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1912   submats[0] = submat;
1913 
1914   /* Restore the indices */
1915   PetscCall(ISRestoreIndices(isrow[0], &irow));
1916   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1917 
1918   /* Destroy allocated memory */
1919   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1920   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1921   if (sbuf_aa) {
1922     PetscCall(PetscFree(sbuf_aa[0]));
1923     PetscCall(PetscFree(sbuf_aa));
1924   }
1925 
1926   if (scall == MAT_INITIAL_MATRIX) {
1927     PetscCall(PetscFree(lens));
1928     if (sbuf_aj) {
1929       PetscCall(PetscFree(sbuf_aj[0]));
1930       PetscCall(PetscFree(sbuf_aj));
1931     }
1932   }
1933   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1934   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1935   PetscFunctionReturn(PETSC_SUCCESS);
1936 }
1937 
1938 static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1939 {
1940   PetscInt  ncol;
1941   PetscBool colflag, allcolumns = PETSC_FALSE;
1942 
1943   PetscFunctionBegin;
1944   /* Allocate memory to hold all the submatrices */
1945   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1946 
1947   /* Check for special case: each processor gets entire matrix columns */
1948   PetscCall(ISIdentity(iscol[0], &colflag));
1949   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1950   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1951 
1952   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1953   PetscFunctionReturn(PETSC_SUCCESS);
1954 }
1955 
/*
   Extract ismax sequential submatrices from the parallel MPIAIJ matrix C.

   Collective on C. Every rank must call this the same number of times with
   consistent scall; the staged loop below and the MPIU_Allreduce() are
   collective, so all ranks must agree on nstages.

   Special cases handled before the general algorithm:
     - C->submat_singleis: each rank has a single IS (set by PCSetUp_ASM()),
       dispatch to the SingleIS path and skip the Allreduce entirely.
     - wantallmatrix: each rank requests the identity submatrix of a square C,
       dispatch to MatCreateSubMatrix_MPIAIJ_All().
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* nmax = number of submatrices that fit in a ~20MB PetscInt budget per stage;
     a matrix with more columns allows fewer submatrices per stage */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* always make progress: at least one submatrix per stage */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* allow the user to opt out of the fast whole-matrix path */
        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One collective reduction for both values: wantallmatrix is encoded as a
       negative value so that MPI_MAX yields "all ranks want the whole matrix"
       only when every rank set it (max of the negations = -min) */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* the INITIAL call went through the SingleIS path; reuse must too */
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else {
      nstages = smat->nstages; /* global stage count saved by the INITIAL call */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  /* Staged extraction: ranks with fewer local IS than global nstages still make
     the collective _Local call with max_no == 0, producing a dummy matrix */
  for (PetscInt i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2052 
2053 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2054 {
2055   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2056   Mat              A = c->A;
2057   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2058   const PetscInt **icol, **irow;
2059   PetscInt        *nrow, *ncol, start;
2060   PetscMPIInt      nrqs = 0, *pa, proc = -1;
2061   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
2062   PetscInt       **sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row;
2063   PetscInt         msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol;
2064   PetscInt       **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2065   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2066 #if defined(PETSC_USE_CTABLE)
2067   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2068 #else
2069   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2070 #endif
2071   const PetscInt *irow_i;
2072   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2073   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2074   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2075   MPI_Comm        comm;
2076   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2077   PetscMPIInt    *onodes1, *olengths1, end, **row2proc, *row2proc_i;
2078   PetscInt        ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2079   Mat_SubSppt    *smat_i;
2080   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2081   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt;
2082 
2083   PetscFunctionBegin;
2084   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2085   size = c->size;
2086   rank = c->rank;
2087 
2088   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2089   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2090 
2091   for (PetscInt i = 0; i < ismax; i++) {
2092     PetscCall(ISSorted(iscol[i], &issorted[i]));
2093     if (!issorted[i]) iscsorted = issorted[i];
2094 
2095     PetscCall(ISSorted(isrow[i], &issorted[i]));
2096 
2097     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2098     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2099 
2100     /* Check for special case: allcolumn */
2101     PetscCall(ISIdentity(iscol[i], &colflag));
2102     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2103     if (colflag && ncol[i] == C->cmap->N) {
2104       allcolumns[i] = PETSC_TRUE;
2105       icol[i]       = NULL;
2106     } else {
2107       allcolumns[i] = PETSC_FALSE;
2108       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2109     }
2110   }
2111 
2112   if (scall == MAT_REUSE_MATRIX) {
2113     /* Assumes new rows are same length as the old rows */
2114     for (PetscInt i = 0; i < ismax; i++) {
2115       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2116       subc = (Mat_SeqAIJ *)submats[i]->data;
2117       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2118 
2119       /* Initial matrix as if empty */
2120       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2121 
2122       smat_i = subc->submatis1;
2123 
2124       nrqs        = smat_i->nrqs;
2125       nrqr        = smat_i->nrqr;
2126       rbuf1       = smat_i->rbuf1;
2127       rbuf2       = smat_i->rbuf2;
2128       rbuf3       = smat_i->rbuf3;
2129       req_source2 = smat_i->req_source2;
2130 
2131       sbuf1 = smat_i->sbuf1;
2132       sbuf2 = smat_i->sbuf2;
2133       ptr   = smat_i->ptr;
2134       tmp   = smat_i->tmp;
2135       ctr   = smat_i->ctr;
2136 
2137       pa          = smat_i->pa;
2138       req_size    = smat_i->req_size;
2139       req_source1 = smat_i->req_source1;
2140 
2141       allcolumns[i] = smat_i->allcolumns;
2142       row2proc[i]   = smat_i->row2proc;
2143       rmap[i]       = smat_i->rmap;
2144       cmap[i]       = smat_i->cmap;
2145     }
2146 
2147     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2148       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2149       smat_i = (Mat_SubSppt *)submats[0]->data;
2150 
2151       nrqs        = smat_i->nrqs;
2152       nrqr        = smat_i->nrqr;
2153       rbuf1       = smat_i->rbuf1;
2154       rbuf2       = smat_i->rbuf2;
2155       rbuf3       = smat_i->rbuf3;
2156       req_source2 = smat_i->req_source2;
2157 
2158       sbuf1 = smat_i->sbuf1;
2159       sbuf2 = smat_i->sbuf2;
2160       ptr   = smat_i->ptr;
2161       tmp   = smat_i->tmp;
2162       ctr   = smat_i->ctr;
2163 
2164       pa          = smat_i->pa;
2165       req_size    = smat_i->req_size;
2166       req_source1 = smat_i->req_source1;
2167 
2168       allcolumns[0] = PETSC_FALSE;
2169     }
2170   } else { /* scall == MAT_INITIAL_MATRIX */
2171     /* Get some new tags to keep the communication clean */
2172     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2173     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2174 
2175     /* evaluate communication - mesg to who, length of mesg, and buffer space
2176      required. Based on this, buffers are allocated, and data copied into them*/
2177     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2178 
2179     for (PetscInt i = 0; i < ismax; i++) {
2180       jmax   = nrow[i];
2181       irow_i = irow[i];
2182 
2183       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2184       row2proc[i] = row2proc_i;
2185 
2186       if (issorted[i]) proc = 0;
2187       for (PetscInt j = 0; j < jmax; j++) {
2188         if (!issorted[i]) proc = 0;
2189         row = irow_i[j];
2190         while (row >= C->rmap->range[proc + 1]) proc++;
2191         w4[proc]++;
2192         row2proc_i[j] = proc; /* map row index to proc */
2193       }
2194       for (PetscMPIInt j = 0; j < size; j++) {
2195         if (w4[j]) {
2196           w1[j] += w4[j];
2197           w3[j]++;
2198           w4[j] = 0;
2199         }
2200       }
2201     }
2202 
2203     nrqs     = 0; /* no of outgoing messages */
2204     msz      = 0; /* total mesg length (for all procs) */
2205     w1[rank] = 0; /* no mesg sent to self */
2206     w3[rank] = 0;
2207     for (PetscMPIInt i = 0; i < size; i++) {
2208       if (w1[i]) {
2209         w2[i] = 1;
2210         nrqs++;
2211       } /* there exists a message to proc i */
2212     }
2213     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2214     for (PetscMPIInt i = 0, j = 0; i < size; i++) {
2215       if (w1[i]) {
2216         pa[j] = i;
2217         j++;
2218       }
2219     }
2220 
2221     /* Each message would have a header = 1 + 2*(no of IS) + data */
2222     for (PetscMPIInt i = 0; i < nrqs; i++) {
2223       PetscMPIInt j = pa[i];
2224       w1[j] += w2[j] + 2 * w3[j];
2225       msz += w1[j];
2226     }
2227     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
2228 
2229     /* Determine the number of messages to expect, their lengths, from from-ids */
2230     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2231     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2232 
2233     /* Now post the Irecvs corresponding to these messages */
2234     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2235     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2236 
2237     /* Allocate Memory for outgoing messages */
2238     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2239     PetscCall(PetscArrayzero(sbuf1, size));
2240     PetscCall(PetscArrayzero(ptr, size));
2241 
2242     {
2243       PetscInt *iptr = tmp;
2244       k              = 0;
2245       for (PetscMPIInt i = 0; i < nrqs; i++) {
2246         PetscMPIInt j = pa[i];
2247         iptr += k;
2248         sbuf1[j] = iptr;
2249         k        = w1[j];
2250       }
2251     }
2252 
2253     /* Form the outgoing messages. Initialize the header space */
2254     for (PetscMPIInt i = 0; i < nrqs; i++) {
2255       PetscMPIInt j = pa[i];
2256       sbuf1[j][0]   = 0;
2257       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2258       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2259     }
2260 
2261     /* Parse the isrow and copy data into outbuf */
2262     for (PetscInt i = 0; i < ismax; i++) {
2263       row2proc_i = row2proc[i];
2264       PetscCall(PetscArrayzero(ctr, size));
2265       irow_i = irow[i];
2266       jmax   = nrow[i];
2267       for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */
2268         proc = row2proc_i[j];
2269         if (proc != rank) { /* copy to the outgoing buf*/
2270           ctr[proc]++;
2271           *ptr[proc] = irow_i[j];
2272           ptr[proc]++;
2273         }
2274       }
2275       /* Update the headers for the current IS */
2276       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
2277         if ((ctr_j = ctr[j])) {
2278           sbuf1_j            = sbuf1[j];
2279           k                  = ++sbuf1_j[0];
2280           sbuf1_j[2 * k]     = ctr_j;
2281           sbuf1_j[2 * k - 1] = i;
2282         }
2283       }
2284     }
2285 
2286     /*  Now  post the sends */
2287     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2288     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2289       PetscMPIInt j = pa[i];
2290       PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2291     }
2292 
2293     /* Post Receives to capture the buffer size */
2294     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2295     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2296     if (nrqs) rbuf2[0] = tmp + msz;
2297     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2298     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2299       PetscMPIInt j = pa[i];
2300       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2301     }
2302 
2303     /* Send to other procs the buf size they should allocate */
2304     /* Receive messages*/
2305     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2306     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2307     {
2308       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2309       PetscInt *sbuf2_i;
2310 
2311       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2312       for (PetscMPIInt i = 0; i < nrqr; ++i) {
2313         req_size[i] = 0;
2314         rbuf1_i     = rbuf1[i];
2315         start       = 2 * rbuf1_i[0] + 1;
2316         end         = olengths1[i];
2317         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2318         sbuf2_i = sbuf2[i];
2319         for (PetscInt j = start; j < end; j++) {
2320           id         = rbuf1_i[j] - rstart;
2321           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2322           sbuf2_i[j] = ncols;
2323           req_size[i] += ncols;
2324         }
2325         req_source1[i] = onodes1[i];
2326         /* form the header */
2327         sbuf2_i[0] = req_size[i];
2328         for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2329 
2330         PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2331       }
2332     }
2333 
2334     PetscCall(PetscFree(onodes1));
2335     PetscCall(PetscFree(olengths1));
2336     PetscCall(PetscFree(r_waits1));
2337     PetscCall(PetscFree4(w1, w2, w3, w4));
2338 
2339     /* Receive messages*/
2340     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2341     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2342     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2343       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2344       req_source2[i] = pa[i];
2345       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2346     }
2347     PetscCall(PetscFree(r_waits2));
2348 
2349     /* Wait on sends1 and sends2 */
2350     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2351     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2352     PetscCall(PetscFree(s_waits1));
2353     PetscCall(PetscFree(s_waits2));
2354 
2355     /* Now allocate sending buffers for a->j, and send them off */
2356     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2357     jcnt = 0;
2358     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2359     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
2360     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2361 
2362     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2363     {
2364       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2365       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2366       PetscInt  cend = C->cmap->rend;
2367       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2368 
2369       for (PetscMPIInt i = 0; i < nrqr; i++) {
2370         rbuf1_i   = rbuf1[i];
2371         sbuf_aj_i = sbuf_aj[i];
2372         ct1       = 2 * rbuf1_i[0] + 1;
2373         ct2       = 0;
2374         for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2375           kmax = rbuf1[i][2 * j];
2376           for (PetscInt k = 0; k < kmax; k++, ct1++) {
2377             row    = rbuf1_i[ct1] - rstart;
2378             nzA    = a_i[row + 1] - a_i[row];
2379             nzB    = b_i[row + 1] - b_i[row];
2380             ncols  = nzA + nzB;
2381             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2382             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2383 
2384             /* load the column indices for this row into cols */
2385             cols = sbuf_aj_i + ct2;
2386 
2387             lwrite = 0;
2388             for (PetscInt l = 0; l < nzB; l++) {
2389               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2390             }
2391             for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2392             for (PetscInt l = 0; l < nzB; l++) {
2393               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2394             }
2395 
2396             ct2 += ncols;
2397           }
2398         }
2399         PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2400       }
2401     }
2402 
2403     /* create col map: global col of C -> local col of submatrices */
2404     {
2405       const PetscInt *icol_i;
2406 #if defined(PETSC_USE_CTABLE)
2407       for (PetscInt i = 0; i < ismax; i++) {
2408         if (!allcolumns[i]) {
2409           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2410 
2411           jmax   = ncol[i];
2412           icol_i = icol[i];
2413           cmap_i = cmap[i];
2414           for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2415         } else cmap[i] = NULL;
2416       }
2417 #else
2418       for (PetscInt i = 0; i < ismax; i++) {
2419         if (!allcolumns[i]) {
2420           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2421           jmax   = ncol[i];
2422           icol_i = icol[i];
2423           cmap_i = cmap[i];
2424           for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2425         } else cmap[i] = NULL;
2426       }
2427 #endif
2428     }
2429 
2430     /* Create lens which is required for MatCreate... */
2431     jcnt = 0;
2432     for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i];
2433     PetscCall(PetscMalloc1(ismax, &lens));
2434 
2435     if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0]));
2436     for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
2437 
2438     /* Update lens from local data */
2439     for (PetscInt i = 0; i < ismax; i++) {
2440       row2proc_i = row2proc[i];
2441       jmax       = nrow[i];
2442       if (!allcolumns[i]) cmap_i = cmap[i];
2443       irow_i = irow[i];
2444       lens_i = lens[i];
2445       for (PetscInt j = 0; j < jmax; j++) {
2446         row  = irow_i[j];
2447         proc = row2proc_i[j];
2448         if (proc == rank) {
2449           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2450           if (!allcolumns[i]) {
2451             for (PetscInt k = 0; k < ncols; k++) {
2452 #if defined(PETSC_USE_CTABLE)
2453               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2454 #else
2455               tcol = cmap_i[cols[k]];
2456 #endif
2457               if (tcol) lens_i[j]++;
2458             }
2459           } else { /* allcolumns */
2460             lens_i[j] = ncols;
2461           }
2462           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2463         }
2464       }
2465     }
2466 
2467     /* Create row map: global row of C -> local row of submatrices */
2468 #if defined(PETSC_USE_CTABLE)
2469     for (PetscInt i = 0; i < ismax; i++) {
2470       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2471       irow_i = irow[i];
2472       jmax   = nrow[i];
2473       for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2474     }
2475 #else
2476     for (PetscInt i = 0; i < ismax; i++) {
2477       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2478       rmap_i = rmap[i];
2479       irow_i = irow[i];
2480       jmax   = nrow[i];
2481       for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2482     }
2483 #endif
2484 
2485     /* Update lens from offproc data */
2486     {
2487       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2488 
2489       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2490       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2491         sbuf1_i = sbuf1[pa[tmp2]];
2492         jmax    = sbuf1_i[0];
2493         ct1     = 2 * jmax + 1;
2494         ct2     = 0;
2495         rbuf2_i = rbuf2[tmp2];
2496         rbuf3_i = rbuf3[tmp2];
2497         for (PetscInt j = 1; j <= jmax; j++) {
2498           is_no  = sbuf1_i[2 * j - 1];
2499           max1   = sbuf1_i[2 * j];
2500           lens_i = lens[is_no];
2501           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2502           rmap_i = rmap[is_no];
2503           for (PetscInt k = 0; k < max1; k++, ct1++) {
2504 #if defined(PETSC_USE_CTABLE)
2505             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2506             row--;
2507             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2508 #else
2509             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2510 #endif
2511             max2 = rbuf2_i[ct1];
2512             for (PetscInt l = 0; l < max2; l++, ct2++) {
2513               if (!allcolumns[is_no]) {
2514 #if defined(PETSC_USE_CTABLE)
2515                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2516 #else
2517                 tcol = cmap_i[rbuf3_i[ct2]];
2518 #endif
2519                 if (tcol) lens_i[row]++;
2520               } else {         /* allcolumns */
2521                 lens_i[row]++; /* lens_i[row] += max2 ? */
2522               }
2523             }
2524           }
2525         }
2526       }
2527     }
2528     PetscCall(PetscFree(r_waits3));
2529     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2530     PetscCall(PetscFree(s_waits3));
2531 
2532     /* Create the submatrices */
2533     for (PetscInt i = 0; i < ismax; i++) {
2534       PetscInt rbs, cbs;
2535 
2536       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2537       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2538 
2539       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2540       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2541 
2542       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2543       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2544       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2545 
2546       /* create struct Mat_SubSppt and attached it to submat */
2547       PetscCall(PetscNew(&smat_i));
2548       subc            = (Mat_SeqAIJ *)submats[i]->data;
2549       subc->submatis1 = smat_i;
2550 
2551       smat_i->destroy          = submats[i]->ops->destroy;
2552       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2553       submats[i]->factortype   = C->factortype;
2554 
2555       smat_i->id          = i;
2556       smat_i->nrqs        = nrqs;
2557       smat_i->nrqr        = nrqr;
2558       smat_i->rbuf1       = rbuf1;
2559       smat_i->rbuf2       = rbuf2;
2560       smat_i->rbuf3       = rbuf3;
2561       smat_i->sbuf2       = sbuf2;
2562       smat_i->req_source2 = req_source2;
2563 
2564       smat_i->sbuf1 = sbuf1;
2565       smat_i->ptr   = ptr;
2566       smat_i->tmp   = tmp;
2567       smat_i->ctr   = ctr;
2568 
2569       smat_i->pa          = pa;
2570       smat_i->req_size    = req_size;
2571       smat_i->req_source1 = req_source1;
2572 
2573       smat_i->allcolumns = allcolumns[i];
2574       smat_i->singleis   = PETSC_FALSE;
2575       smat_i->row2proc   = row2proc[i];
2576       smat_i->rmap       = rmap[i];
2577       smat_i->cmap       = cmap[i];
2578     }
2579 
2580     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2581       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2582       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2583       PetscCall(MatSetType(submats[0], MATDUMMY));
2584 
2585       /* create struct Mat_SubSppt and attached it to submat */
2586       PetscCall(PetscNew(&smat_i));
2587       submats[0]->data = (void *)smat_i;
2588 
2589       smat_i->destroy          = submats[0]->ops->destroy;
2590       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2591       submats[0]->factortype   = C->factortype;
2592 
2593       smat_i->id          = 0;
2594       smat_i->nrqs        = nrqs;
2595       smat_i->nrqr        = nrqr;
2596       smat_i->rbuf1       = rbuf1;
2597       smat_i->rbuf2       = rbuf2;
2598       smat_i->rbuf3       = rbuf3;
2599       smat_i->sbuf2       = sbuf2;
2600       smat_i->req_source2 = req_source2;
2601 
2602       smat_i->sbuf1 = sbuf1;
2603       smat_i->ptr   = ptr;
2604       smat_i->tmp   = tmp;
2605       smat_i->ctr   = ctr;
2606 
2607       smat_i->pa          = pa;
2608       smat_i->req_size    = req_size;
2609       smat_i->req_source1 = req_source1;
2610 
2611       smat_i->allcolumns = PETSC_FALSE;
2612       smat_i->singleis   = PETSC_FALSE;
2613       smat_i->row2proc   = NULL;
2614       smat_i->rmap       = NULL;
2615       smat_i->cmap       = NULL;
2616     }
2617 
2618     if (ismax) PetscCall(PetscFree(lens[0]));
2619     PetscCall(PetscFree(lens));
2620     if (sbuf_aj) {
2621       PetscCall(PetscFree(sbuf_aj[0]));
2622       PetscCall(PetscFree(sbuf_aj));
2623     }
2624 
2625   } /* endof scall == MAT_INITIAL_MATRIX */
2626 
2627   /* Post recv matrix values */
2628   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2629   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2630   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2631   for (PetscMPIInt i = 0; i < nrqs; ++i) {
2632     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2633     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2634   }
2635 
2636   /* Allocate sending buffers for a->a, and send them off */
2637   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2638   jcnt = 0;
2639   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2640   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
2641   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2642 
2643   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2644   {
2645     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2646     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2647     PetscInt     cend = C->cmap->rend;
2648     PetscInt    *b_j  = b->j;
2649     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2650 
2651     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2652     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2653     for (PetscMPIInt i = 0; i < nrqr; i++) {
2654       rbuf1_i   = rbuf1[i];
2655       sbuf_aa_i = sbuf_aa[i];
2656       ct1       = 2 * rbuf1_i[0] + 1;
2657       ct2       = 0;
2658       for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2659         kmax = rbuf1_i[2 * j];
2660         for (PetscInt k = 0; k < kmax; k++, ct1++) {
2661           row    = rbuf1_i[ct1] - rstart;
2662           nzA    = a_i[row + 1] - a_i[row];
2663           nzB    = b_i[row + 1] - b_i[row];
2664           ncols  = nzA + nzB;
2665           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2666           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2667           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2668 
2669           /* load the column values for this row into vals*/
2670           vals = sbuf_aa_i + ct2;
2671 
2672           lwrite = 0;
2673           for (PetscInt l = 0; l < nzB; l++) {
2674             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2675           }
2676           for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2677           for (PetscInt l = 0; l < nzB; l++) {
2678             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2679           }
2680 
2681           ct2 += ncols;
2682         }
2683       }
2684       PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2685     }
2686     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2687     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2688   }
2689 
2690   /* Assemble the matrices */
2691   /* First assemble the local rows */
2692   for (PetscInt i = 0; i < ismax; i++) {
2693     row2proc_i = row2proc[i];
2694     subc       = (Mat_SeqAIJ *)submats[i]->data;
2695     imat_ilen  = subc->ilen;
2696     imat_j     = subc->j;
2697     imat_i     = subc->i;
2698     imat_a     = subc->a;
2699 
2700     if (!allcolumns[i]) cmap_i = cmap[i];
2701     rmap_i = rmap[i];
2702     irow_i = irow[i];
2703     jmax   = nrow[i];
2704     for (PetscInt j = 0; j < jmax; j++) {
2705       row  = irow_i[j];
2706       proc = row2proc_i[j];
2707       if (proc == rank) {
2708         old_row = row;
2709 #if defined(PETSC_USE_CTABLE)
2710         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2711         row--;
2712 #else
2713         row = rmap_i[row];
2714 #endif
2715         ilen_row = imat_ilen[row];
2716         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2717         mat_i = imat_i[row];
2718         mat_a = imat_a + mat_i;
2719         mat_j = imat_j + mat_i;
2720         if (!allcolumns[i]) {
2721           for (PetscInt k = 0; k < ncols; k++) {
2722 #if defined(PETSC_USE_CTABLE)
2723             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2724 #else
2725             tcol = cmap_i[cols[k]];
2726 #endif
2727             if (tcol) {
2728               *mat_j++ = tcol - 1;
2729               *mat_a++ = vals[k];
2730               ilen_row++;
2731             }
2732           }
2733         } else { /* allcolumns */
2734           for (PetscInt k = 0; k < ncols; k++) {
2735             *mat_j++ = cols[k]; /* global col index! */
2736             *mat_a++ = vals[k];
2737             ilen_row++;
2738           }
2739         }
2740         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2741 
2742         imat_ilen[row] = ilen_row;
2743       }
2744     }
2745   }
2746 
2747   /* Now assemble the off proc rows */
2748   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2749   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2750     sbuf1_i = sbuf1[pa[tmp2]];
2751     jmax    = sbuf1_i[0];
2752     ct1     = 2 * jmax + 1;
2753     ct2     = 0;
2754     rbuf2_i = rbuf2[tmp2];
2755     rbuf3_i = rbuf3[tmp2];
2756     rbuf4_i = rbuf4[tmp2];
2757     for (PetscInt j = 1; j <= jmax; j++) {
2758       is_no  = sbuf1_i[2 * j - 1];
2759       rmap_i = rmap[is_no];
2760       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2761       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2762       imat_ilen = subc->ilen;
2763       imat_j    = subc->j;
2764       imat_i    = subc->i;
2765       imat_a    = subc->a;
2766       max1      = sbuf1_i[2 * j];
2767       for (PetscInt k = 0; k < max1; k++, ct1++) {
2768         row = sbuf1_i[ct1];
2769 #if defined(PETSC_USE_CTABLE)
2770         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2771         row--;
2772 #else
2773         row = rmap_i[row];
2774 #endif
2775         ilen  = imat_ilen[row];
2776         mat_i = imat_i[row];
2777         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2778         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2779         max2  = rbuf2_i[ct1];
2780         if (!allcolumns[is_no]) {
2781           for (PetscInt l = 0; l < max2; l++, ct2++) {
2782 #if defined(PETSC_USE_CTABLE)
2783             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2784 #else
2785             tcol = cmap_i[rbuf3_i[ct2]];
2786 #endif
2787             if (tcol) {
2788               *mat_j++ = tcol - 1;
2789               *mat_a++ = rbuf4_i[ct2];
2790               ilen++;
2791             }
2792           }
2793         } else { /* allcolumns */
2794           for (PetscInt l = 0; l < max2; l++, ct2++) {
2795             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2796             *mat_a++ = rbuf4_i[ct2];
2797             ilen++;
2798           }
2799         }
2800         imat_ilen[row] = ilen;
2801       }
2802     }
2803   }
2804 
2805   if (!iscsorted) { /* sort column indices of the rows */
2806     for (PetscInt i = 0; i < ismax; i++) {
2807       subc      = (Mat_SeqAIJ *)submats[i]->data;
2808       imat_j    = subc->j;
2809       imat_i    = subc->i;
2810       imat_a    = subc->a;
2811       imat_ilen = subc->ilen;
2812 
2813       if (allcolumns[i]) continue;
2814       jmax = nrow[i];
2815       for (PetscInt j = 0; j < jmax; j++) {
2816         mat_i = imat_i[j];
2817         mat_a = imat_a + mat_i;
2818         mat_j = imat_j + mat_i;
2819         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2820       }
2821     }
2822   }
2823 
2824   PetscCall(PetscFree(r_waits4));
2825   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2826   PetscCall(PetscFree(s_waits4));
2827 
2828   /* Restore the indices */
2829   for (PetscInt i = 0; i < ismax; i++) {
2830     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2831     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2832   }
2833 
2834   for (PetscInt i = 0; i < ismax; i++) {
2835     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2836     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2837   }
2838 
2839   /* Destroy allocated memory */
2840   if (sbuf_aa) {
2841     PetscCall(PetscFree(sbuf_aa[0]));
2842     PetscCall(PetscFree(sbuf_aa));
2843   }
2844   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2845 
2846   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2847   PetscCall(PetscFree(rbuf4));
2848 
2849   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2850   PetscFunctionReturn(PETSC_SUCCESS);
2851 }
2852 
2853 /*
2854  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2855  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2857  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2858  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2859  state, and needs to be "assembled" later by compressing B's column space.
2860 
2861  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2862  Following this call, C->A & C->B have been created, even if empty.
2863  */
2864 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2865 {
2866   /* If making this function public, change the error returned in this function away from _PLIB. */
2867   Mat_MPIAIJ     *aij;
2868   Mat_SeqAIJ     *Baij;
2869   PetscBool       seqaij, Bdisassembled;
2870   PetscInt        m, n, *nz, ngcol, col, rstart, rend, shift, count;
2871   PetscScalar     v;
2872   const PetscInt *rowindices, *colindices;
2873 
2874   PetscFunctionBegin;
2875   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2876   if (A) {
2877     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2878     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2879     if (rowemb) {
2880       PetscCall(ISGetLocalSize(rowemb, &m));
2881       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2882     } else {
2883       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2884     }
2885     if (dcolemb) {
2886       PetscCall(ISGetLocalSize(dcolemb, &n));
2887       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2888     } else {
2889       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2890     }
2891   }
2892   if (B) {
2893     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2894     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2895     if (rowemb) {
2896       PetscCall(ISGetLocalSize(rowemb, &m));
2897       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2898     } else {
2899       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2900     }
2901     if (ocolemb) {
2902       PetscCall(ISGetLocalSize(ocolemb, &n));
2903       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2904     } else {
2905       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2906     }
2907   }
2908 
2909   aij = (Mat_MPIAIJ *)C->data;
2910   if (!aij->A) {
2911     /* Mimic parts of MatMPIAIJSetPreallocation() */
2912     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2913     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2914     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2915     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2916   }
2917   if (A) {
2918     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2919   } else {
2920     PetscCall(MatSetUp(aij->A));
2921   }
2922   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2923     /*
2924       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2925       need to "disassemble" B -- convert it to using C's global indices.
2926       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2927 
2928       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2929       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2930 
2931       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2932       At least avoid calling MatSetValues() and the implied searches?
2933     */
2934 
2935     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2936 #if defined(PETSC_USE_CTABLE)
2937       PetscCall(PetscHMapIDestroy(&aij->colmap));
2938 #else
2939       PetscCall(PetscFree(aij->colmap));
2940       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2941 #endif
2942       ngcol = 0;
2943       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2944       if (aij->garray) PetscCall(PetscFree(aij->garray));
2945       PetscCall(VecDestroy(&aij->lvec));
2946       PetscCall(VecScatterDestroy(&aij->Mvctx));
2947     }
2948     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2949     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2950   }
2951   Bdisassembled = PETSC_FALSE;
2952   if (!aij->B) {
2953     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2954     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2955     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2956     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2957     Bdisassembled = PETSC_TRUE;
2958   }
2959   if (B) {
2960     Baij = (Mat_SeqAIJ *)B->data;
2961     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2962       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2963       for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2964       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2965       PetscCall(PetscFree(nz));
2966     }
2967 
2968     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2969     shift      = rend - rstart;
2970     count      = 0;
2971     rowindices = NULL;
2972     colindices = NULL;
2973     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2974     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2975     for (PetscInt i = 0; i < B->rmap->n; i++) {
2976       PetscInt row;
2977       row = i;
2978       if (rowindices) row = rowindices[i];
2979       for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2980         col = Baij->j[count];
2981         if (colindices) col = colindices[col];
2982         if (Bdisassembled && col >= rstart) col += shift;
2983         v = Baij->a[count];
2984         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2985         ++count;
2986       }
2987     }
2988     /* No assembly for aij->B is necessary. */
2989     /* FIXME: set aij->B's nonzerostate correctly. */
2990   } else {
2991     PetscCall(MatSetUp(aij->B));
2992   }
2993   C->preallocated  = PETSC_TRUE;
2994   C->was_assembled = PETSC_FALSE;
2995   C->assembled     = PETSC_FALSE;
2996   /*
2997       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2998       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2999    */
3000   PetscFunctionReturn(PETSC_SUCCESS);
3001 }
3002 
3003 /*
3004   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3005  */
3006 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3007 {
3008   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3009 
3010   PetscFunctionBegin;
3011   PetscAssertPointer(A, 2);
3012   PetscAssertPointer(B, 3);
3013   /* FIXME: make sure C is assembled */
3014   *A = aij->A;
3015   *B = aij->B;
3016   /* Note that we don't incref *A and *B, so be careful! */
3017   PetscFunctionReturn(PETSC_SUCCESS);
3018 }
3019 
3020 /*
3021   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3022   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3023 */
/*
  Generic driver, parameterized by format-specific callbacks:
    getsubmats_seq - extracts sequential submatrices given (sequential) ISs
    getlocalmats   - returns the local diag/off-diag blocks (NOTE: never called in this routine's body)
    setseqmat      - embeds a single sequential matrix into a sequential submatrix (comm size == 1 case)
    setseqmats     - embeds a diag/off-diag pair of sequential matrices into a parallel submatrix
  scall may be MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX; the resulting (possibly parallel) submatrices are returned in submat.
*/
static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    cismax, ispar; /* cismax: # of parallel (comm size > 1) ISs on this rank; ispar: global max of cismax over C's comm */
  Mat        *A, *B;         /* A: diag sequential pieces (one per IS); B: off-diag sequential pieces (one per parallel IS) */
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p; /* *_p: sort permutations; cisrow/ciscol: ISs for the off-diag extraction */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

  /* Validate the IS pairs and count how many live on a communicator with more than one rank */
  cismax = 0;
  for (PetscInt i = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) { /* ii indexes the parallel subset of ISs */
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
     */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    /* After sorting, duplicates are adjacent, so a single pass over neighbors detects them */
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS as the off-diag row IS; see the comment above PetscMalloc2(cismax, ...) */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
        - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        /* Local sizes come from the extracted diag piece; global sizes are derived by the layouts */
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* BB is intentionally not destroyed here: all off-diag pieces are freed together by MatDestroySubMatrices() below */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        /* A nonzero permutation IS means the caller's ordering differs from the sorted one used for extraction */
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* hand the extracted Seq matrix over directly (ownership transfers to submat) */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3211 
/*
  MPIAIJ instantiation of the generic MPIXAIJ submatrix-extraction driver: supplies
  the AIJ-specific callbacks for sequential extraction, local-block access, and
  embedding of sequential pieces into the resulting (possibly parallel) submatrices.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3218