xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision ccfb0f9f40a0131988d7995ed9679700dae2a75a)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
25    that is a very major recoding job.
26 
27    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
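
   For example (illustrative): with bs = 2 and an input IS containing {1, 4, 5}, the touched blocks
   are {0, 2}, so the IS is replaced by the block IS {0, 2}, which covers the point indices {0, 1, 4, 5}.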
28 */
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
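
/*
   These MPIAIJ routines back the public MatIncreaseOverlap() interface. A minimal usage sketch
   (illustrative only; nlocal and rows are hypothetical local data, not defined in this file):

     IS is[1];
     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, nlocal, rows, PETSC_COPY_VALUES, &is[0]));
     PetscCall(MatIncreaseOverlap(C, 1, is, 2));

   grows is[0] by two levels of overlap in C, as done, for example, when building overlapping
   subdomains for additive Schwarz methods.
*/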
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89 {
90   MPI_Comm        comm;
91   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
92   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94   const PetscInt *indices_i, **indices;
95   PetscLayout     rmap;
96   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
97   PetscSF         sf;
98   PetscSFNode    *remote;
99 
100   PetscFunctionBegin;
101   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103   PetscCallMPI(MPI_Comm_size(comm, &size));
104   /* get row map to determine where rows should be going */
105   PetscCall(MatGetLayouts(mat, &rmap, NULL));
106   /* retrieve IS data and put it all together so that we
107    * can optimize communication
108    */
109   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110   tlength = 0;
111   for (PetscInt i = 0; i < nidx; i++) {
112     PetscCall(ISGetLocalSize(is[i], &length[i]));
113     tlength += length[i];
114     PetscCall(ISGetIndices(is[i], &indices[i]));
115   }
116   /* find these rows on remote processors */
117   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
118   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
119   nrrows = 0;
120   for (PetscInt i = 0; i < nidx; i++) {
121     length_i  = length[i];
122     indices_i = indices[i];
123     for (PetscInt j = 0; j < length_i; j++) {
124       owner = -1;
125       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
126       /* remote processors */
127       if (owner != rank) {
128         tosizes_temp[owner]++;               /* number of rows to owner */
129         rrow_ranks[nrrows]   = owner;        /* processor */
130         rrow_isids[nrrows]   = i;            /* is id */
131         remoterows[nrrows++] = indices_i[j]; /* row */
132       }
133     }
134     PetscCall(ISRestoreIndices(is[i], &indices[i]));
135   }
136   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
137   /* test if we need to exchange messages
138    * generally speaking, we do not need to exchange
139    * data when overlap is 1
140    */
141   PetscCallMPI(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
142   /* we do not have any messages to exchange;
143    * this usually corresponds to overlap 1
144    */
145   if (!reducednrrows) {
146     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
147     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
148     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
149     PetscFunctionReturn(PETSC_SUCCESS);
150   }
151   nto = 0;
152   /* send sizes and ranks for building a two-sided communication */
153   for (PetscMPIInt i = 0; i < size; i++) {
154     if (tosizes_temp[i]) {
155       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
156       tosizes_temp[i]  = nto;                 /* a map from processor to index */
157       toranks[nto++]   = i;                   /* MPI process */
158     }
159   }
160   PetscCall(PetscMalloc1(nto + 1, &toffsets));
161   toffsets[0] = 0;
162   for (PetscInt i = 0; i < nto; i++) {
163     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
164     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
165   }
166   /* send information to other processors */
167   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
168   nrecvrows = 0;
169   for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
170   PetscCall(PetscMalloc1(nrecvrows, &remote));
171   nrecvrows = 0;
172   for (PetscMPIInt i = 0; i < nfrom; i++) {
173     for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
174       remote[nrecvrows].rank    = fromranks[i];
175       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
176     }
177   }
178   PetscCall(PetscSFCreate(comm, &sf));
179   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
180   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
181   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
182   PetscCall(PetscSFSetFromOptions(sf));
183   /* message pair <no of is, row>  */
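  /* e.g. (illustrative): requesting row 17 for is[0] and row 42 for is[2] from remote ranks yields the pairs {0, 17} and {2, 42} in todata */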
184   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
185   for (PetscInt i = 0; i < nrrows; i++) {
186     owner                 = rrow_ranks[i];       /* process */
187     j                     = tosizes_temp[owner]; /* index */
188     todata[toffsets[j]++] = rrow_isids[i];
189     todata[toffsets[j]++] = remoterows[i];
190   }
191   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
192   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
193   PetscCall(PetscFree(toffsets));
194   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
196   PetscCall(PetscSFDestroy(&sf));
197   /* send rows belonging to remote processes so that we can get the overlapping data back */
198   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
199   PetscCall(PetscFree2(todata, fromdata));
200   PetscCall(PetscFree(fromsizes));
201   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
202   PetscCall(PetscFree(fromranks));
203   nrecvrows = 0;
204   for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
205   PetscCall(PetscCalloc1(nrecvrows, &todata));
206   PetscCall(PetscMalloc1(nrecvrows, &remote));
207   nrecvrows = 0;
208   for (PetscInt i = 0; i < nto; i++) {
209     for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
210       remote[nrecvrows].rank    = toranks[i];
211       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
212     }
213   }
214   PetscCall(PetscSFCreate(comm, &sf));
215   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
216   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
217   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
218   PetscCall(PetscSFSetFromOptions(sf));
219   /* overlap communication and computation */
220   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
221   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
222   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
223   PetscCall(PetscSFDestroy(&sf));
224   PetscCall(PetscFree2(sbdata, sbsizes));
225   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
226   PetscCall(PetscFree(toranks));
227   PetscCall(PetscFree(tosizes));
228   PetscCall(PetscFree(todata));
229   PetscFunctionReturn(PETSC_SUCCESS);
230 }
231 
232 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
233 {
234   PetscInt       *isz, isz_i, i, j, is_id, data_size;
235   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
236   const PetscInt *indices_i_temp;
237   MPI_Comm       *iscomms;
238 
239   PetscFunctionBegin;
240   max_lsize = 0;
241   PetscCall(PetscMalloc1(nidx, &isz));
242   for (i = 0; i < nidx; i++) {
243     PetscCall(ISGetLocalSize(is[i], &lsize));
244     max_lsize = lsize > max_lsize ? lsize : max_lsize;
245     isz[i]    = lsize;
246   }
247   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
248   for (i = 0; i < nidx; i++) {
249     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
250     PetscCall(ISGetIndices(is[i], &indices_i_temp));
251     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
252     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
253     PetscCall(ISDestroy(&is[i]));
254   }
255   /* retrieve information to get row id and its overlap */
256   for (i = 0; i < nrecvs;) {
257     is_id     = recvdata[i++];
258     data_size = recvdata[i++];
259     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
260     isz_i     = isz[is_id];
261     for (j = 0; j < data_size; j++) {
262       col                = recvdata[i++];
263       indices_i[isz_i++] = col;
264     }
265     isz[is_id] = isz_i;
266   }
267   /* remove duplicate entries */
268   for (i = 0; i < nidx; i++) {
269     indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
270     isz_i     = isz[i];
271     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
272     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
273     PetscCall(PetscCommDestroy(&iscomms[i]));
274   }
275   PetscCall(PetscFree(isz));
276   PetscCall(PetscFree2(indices_temp, iscomms));
277   PetscFunctionReturn(PETSC_SUCCESS);
278 }
279 
280 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
281 {
282   PetscLayout     rmap, cmap;
283   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
284   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
285   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
286   const PetscInt *gcols, *ai, *aj, *bi, *bj;
287   Mat             amat, bmat;
288   PetscMPIInt     rank;
289   PetscBool       done;
290   MPI_Comm        comm;
291 
292   PetscFunctionBegin;
293   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
294   PetscCallMPI(MPI_Comm_rank(comm, &rank));
295   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
296   /* Even if the matrix is symmetric, we do not exploit the symmetry here */
297   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
298   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
299   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
300   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
301   /* total number of nonzero values is used to estimate the memory usage in the next step */
302   tnz = ai[an] + bi[bn];
303   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
304   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
305   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
306   /* to find the longest message */
307   max_fszs = 0;
308   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
309   /* is there a better way to estimate the number of nonzeros in the mat? */
310   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
311   for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
312   rows_pos  = 0;
313   totalrows = 0;
314   for (i = 0; i < nfrom; i++) {
315     PetscCall(PetscArrayzero(rows_pos_i, nidx));
316     /* group data together */
317     for (j = 0; j < fromsizes[2 * i]; j += 2) {
318       is_id                       = fromrows[rows_pos++]; /* no of is */
319       rows_i                      = rows_data[is_id];
320       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
321     }
322     /* estimate the space needed to avoid multiple allocations */
323     for (j = 0; j < nidx; j++) {
324       indvc_ij = 0;
325       rows_i   = rows_data[j];
326       for (l = 0; l < rows_pos_i[j]; l++) {
327         row   = rows_i[l] - rstart;
328         start = ai[row];
329         end   = ai[row + 1];
330         for (k = start; k < end; k++) { /* Amat */
331           col                     = aj[k] + cstart;
332           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
333         }
334         start = bi[row];
335         end   = bi[row + 1];
336         for (k = start; k < end; k++) { /* Bmat */
337           col                     = gcols[bj[k]];
338           indices_tmp[indvc_ij++] = col;
339         }
340       }
341       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
342       indv_counts[i * nidx + j] = indvc_ij;
343       totalrows += indvc_ij;
344     }
345   }
346   /* message triple <no of is, number of rows, rows> */
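  /* e.g. (illustrative): if the overlap of the rows requested for is 0 is {5, 9} and for is 3 is {11}, the block appended to sbdata for this sender is {0, 2, 5, 9, 3, 1, 11} */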
347   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
348   totalrows = 0;
349   rows_pos  = 0;
350   /* repeat the grouping pass above, this time filling in the actual send data */
351   for (i = 0; i < nfrom; i++) {
352     PetscCall(PetscArrayzero(rows_pos_i, nidx));
353     for (j = 0; j < fromsizes[2 * i]; j += 2) {
354       is_id                       = fromrows[rows_pos++];
355       rows_i                      = rows_data[is_id];
356       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
357     }
358     /* add data  */
359     for (j = 0; j < nidx; j++) {
360       if (!indv_counts[i * nidx + j]) continue;
361       indvc_ij            = 0;
362       sbdata[totalrows++] = j;
363       sbdata[totalrows++] = indv_counts[i * nidx + j];
364       sbsizes[2 * i] += 2;
365       rows_i = rows_data[j];
366       for (l = 0; l < rows_pos_i[j]; l++) {
367         row   = rows_i[l] - rstart;
368         start = ai[row];
369         end   = ai[row + 1];
370         for (k = start; k < end; k++) { /* Amat */
371           col                     = aj[k] + cstart;
372           indices_tmp[indvc_ij++] = col;
373         }
374         start = bi[row];
375         end   = bi[row + 1];
376         for (k = start; k < end; k++) { /* Bmat */
377           col                     = gcols[bj[k]];
378           indices_tmp[indvc_ij++] = col;
379         }
380       }
381       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
382       sbsizes[2 * i] += indvc_ij;
383       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
384       totalrows += indvc_ij;
385     }
386   }
387   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
388   offsets[0] = 0;
389   for (i = 0; i < nfrom; i++) {
390     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
391     sbsizes[2 * i + 1] = offsets[i];
392   }
393   PetscCall(PetscFree(offsets));
394   if (sbrowsizes) *sbrowsizes = sbsizes;
395   if (sbrows) *sbrows = sbdata;
396   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
397   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
398   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
401 
402 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
403 {
404   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
405   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
406   PetscInt        lsize, lsize_tmp;
407   PetscMPIInt     rank, owner;
408   Mat             amat, bmat;
409   PetscBool       done;
410   PetscLayout     cmap, rmap;
411   MPI_Comm        comm;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
415   PetscCallMPI(MPI_Comm_rank(comm, &rank));
416   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
417   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
418   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
419   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
420   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
421   /* is this a safe way to compute the number of nonzero values? */
422   tnz = ai[an] + bi[bn];
423   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
424   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
425   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
426   /* this is a better way to estimate memory than the old implementation,
427    * which used the global size of the matrix
428    */
429   PetscCall(PetscMalloc1(tnz, &indices_temp));
430   for (i = 0; i < nidx; i++) {
431     MPI_Comm iscomm;
432 
433     PetscCall(ISGetLocalSize(is[i], &lsize));
434     PetscCall(ISGetIndices(is[i], &indices));
435     lsize_tmp = 0;
436     for (j = 0; j < lsize; j++) {
437       owner = -1;
438       row   = indices[j];
439       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
440       if (owner != rank) continue;
441       /* local number */
442       row -= rstart;
443       start = ai[row];
444       end   = ai[row + 1];
445       for (k = start; k < end; k++) { /* Amat */
446         col                       = aj[k] + cstart;
447         indices_temp[lsize_tmp++] = col;
448       }
449       start = bi[row];
450       end   = bi[row + 1];
451       for (k = start; k < end; k++) { /* Bmat */
452         col                       = gcols[bj[k]];
453         indices_temp[lsize_tmp++] = col;
454       }
455     }
456     PetscCall(ISRestoreIndices(is[i], &indices));
457     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
458     PetscCall(ISDestroy(&is[i]));
459     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
460     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
461     PetscCall(PetscCommDestroy(&iscomm));
462   }
463   PetscCall(PetscFree(indices_temp));
464   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
465   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
466   PetscFunctionReturn(PETSC_SUCCESS);
467 }
468 
469 /*
470   Sample message format:
471   If a processor A wants processor B to process some elements corresponding
472   to index sets is[1] and is[5], the message looks like:
473   mesg[0] = 2   (number of index sets in the message)
474   -----------
475   mesg[1] = 1   => is[1]
476   mesg[2] = sizeof(is[1]);
477   -----------
478   mesg[3] = 5   => is[5]
479   mesg[4] = sizeof(is[5]);
480   -----------
481   mesg[5]  ...
482   mesg[n]       data for is[1]
483   -----------
484   mesg[n+1] ...
485   mesg[m]       data for is[5]
486   -----------
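
  For example (illustrative only), if A asks B to process rows {3, 7} of is[1] and row {12} of is[5],
  the message is {2, 1, 2, 5, 1, 3, 7, 12}.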
487 
488   Notes:
489   nrqs - no of requests sent (or to be sent out)
490   nrqr - no of requests received (which have to be or which have been processed)
491 */
492 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
493 {
494   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
495   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
496   const PetscInt **idx, *idx_i;
497   PetscInt        *n, **data, len;
498 #if defined(PETSC_USE_CTABLE)
499   PetscHMapI *table_data, table_data_i;
500   PetscInt   *tdata, tcount, tcount_max;
501 #else
502   PetscInt *data_i, *d_p;
503 #endif
504   PetscMPIInt  size, rank, tag1, tag2, proc = 0, nrqs, *pa;
505   PetscInt     M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
506   PetscInt    *ctr, *tmp, *isz, **xdata, **rbuf2;
507   PetscBT     *table;
508   MPI_Comm     comm;
509   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
510   MPI_Status  *recv_status;
511   MPI_Comm    *iscomms;
512   char        *t_p;
513 
514   PetscFunctionBegin;
515   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
516   size = c->size;
517   rank = c->rank;
518   M    = C->rmap->N;
519 
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
521   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
522 
523   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
524 
525   for (PetscInt i = 0; i < imax; i++) {
526     PetscCall(ISGetIndices(is[i], &idx[i]));
527     PetscCall(ISGetLocalSize(is[i], &n[i]));
528   }
529 
530   /* evaluate communication - mesg to whom, length of mesg, and buffer space
531      required. Based on this, buffers are allocated, and data copied into them  */
532   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
533   for (PetscInt i = 0; i < imax; i++) {
534     PetscCall(PetscArrayzero(w4, size)); /* initialize work vector */
535     idx_i = idx[i];
536     len   = n[i];
537     for (PetscInt j = 0; j < len; j++) {
538       row = idx_i[j];
539       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
540       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
541       w4[proc]++;
542     }
543     for (PetscMPIInt j = 0; j < size; j++) {
544       if (w4[j]) {
545         w1[j] += w4[j];
546         w3[j]++;
547       }
548     }
549   }
550 
551   nrqs     = 0; /* no of outgoing messages */
552   msz      = 0; /* total mesg length (for all procs) */
553   w1[rank] = 0; /* no mesg sent to itself */
554   w3[rank] = 0;
555   for (PetscMPIInt i = 0; i < size; i++) {
556     if (w1[i]) {
557       w2[i] = 1;
558       nrqs++;
559     } /* there exists a message to proc i */
560   }
561   /* pa - is list of processors to communicate with */
562   PetscCall(PetscMalloc1(nrqs, &pa));
563   for (PetscMPIInt i = 0, j = 0; i < size; i++) {
564     if (w1[i]) {
565       pa[j] = i;
566       j++;
567     }
568   }
569 
570   /* Each message would have a header = 1 + 2*(no of IS) + data */
571   for (PetscMPIInt i = 0; i < nrqs; i++) {
572     PetscMPIInt j = pa[i];
573     w1[j] += w2[j] + 2 * w3[j];
574     msz += w1[j];
575   }
576 
577   /* Determine the number of messages to expect, their lengths, and from-ids */
578   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
579   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
580 
581   /* Now post the Irecvs corresponding to these messages */
582   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
583 
584   /* Allocate Memory for outgoing messages */
585   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
586   PetscCall(PetscArrayzero(outdat, size));
587   PetscCall(PetscArrayzero(ptr, size));
588 
589   {
590     PetscInt *iptr = tmp, ict = 0;
591     for (PetscMPIInt i = 0; i < nrqs; i++) {
592       PetscMPIInt j = pa[i];
593       iptr += ict;
594       outdat[j] = iptr;
595       ict       = w1[j];
596     }
597   }
598 
599   /* Form the outgoing messages */
600   /* plug in the headers */
601   for (PetscMPIInt i = 0; i < nrqs; i++) {
602     PetscMPIInt j = pa[i];
603     outdat[j][0]  = 0;
604     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
605     ptr[j] = outdat[j] + 2 * w3[j] + 1;
606   }
607 
608   /* Memory for doing local proc's work */
609   {
610     PetscInt M_BPB_imax = 0;
611 #if defined(PETSC_USE_CTABLE)
612     PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
613     PetscCall(PetscMalloc1(imax, &table_data));
614     for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
615     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
616     for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
617 #else
618     PetscInt Mimax = 0;
619     PetscCall(PetscIntMultError(M, imax, &Mimax));
620     PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
621     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
622     for (PetscInt i = 0; i < imax; i++) {
623       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
624       data[i]  = d_p + M * i;
625     }
626 #endif
627   }
628 
629   /* Parse the IS and update local tables and the outgoing buf with the data */
630   {
631     PetscInt n_i, isz_i, *outdat_j, ctr_j;
632     PetscBT  table_i;
633 
634     for (PetscInt i = 0; i < imax; i++) {
635       PetscCall(PetscArrayzero(ctr, size));
636       n_i     = n[i];
637       table_i = table[i];
638       idx_i   = idx[i];
639 #if defined(PETSC_USE_CTABLE)
640       table_data_i = table_data[i];
641 #else
642       data_i = data[i];
643 #endif
644       isz_i = isz[i];
645       for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
646         row = idx_i[j];
647         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
648         if (proc != rank) { /* copy to the outgoing buffer */
649           ctr[proc]++;
650           *ptr[proc] = row;
651           ptr[proc]++;
652         } else if (!PetscBTLookupSet(table_i, row)) {
653 #if defined(PETSC_USE_CTABLE)
654           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
655 #else
656           data_i[isz_i] = row; /* Update the local table */
657 #endif
658           isz_i++;
659         }
660       }
661       /* Update the headers for the current IS */
662       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
663         if ((ctr_j = ctr[j])) {
664           outdat_j            = outdat[j];
665           k                   = ++outdat_j[0];
666           outdat_j[2 * k]     = ctr_j;
667           outdat_j[2 * k - 1] = i;
668         }
669       }
670       isz[i] = isz_i;
671     }
672   }
673 
674   /*  Now  post the sends */
675   PetscCall(PetscMalloc1(nrqs, &s_waits1));
676   for (PetscMPIInt i = 0; i < nrqs; ++i) {
677     PetscMPIInt j = pa[i];
678     PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
679   }
680 
681   /* No longer need the original indices */
682   PetscCall(PetscMalloc1(imax, &iscomms));
683   for (PetscInt i = 0; i < imax; ++i) {
684     PetscCall(ISRestoreIndices(is[i], idx + i));
685     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
686   }
687   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
688 
689   for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
690 
691   /* Do Local work */
692 #if defined(PETSC_USE_CTABLE)
693   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
694 #else
695   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
696 #endif
697 
698   /* Receive messages */
699   PetscCall(PetscMalloc1(nrqr, &recv_status));
700   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
701   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
702 
703   /* Phase 1 sends are complete - deallocate buffers */
704   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
705   PetscCall(PetscFree4(w1, w2, w3, w4));
706 
707   PetscCall(PetscMalloc1(nrqr, &xdata));
708   PetscCall(PetscMalloc1(nrqr, &isz1));
709   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
710   PetscCall(PetscFree(rbuf[0]));
711   PetscCall(PetscFree(rbuf));
712 
713   /* Send the data back */
714   /* Do a global reduction to know the buffer space req for incoming messages */
715   {
716     PetscMPIInt *rw1;
717 
718     PetscCall(PetscCalloc1(size, &rw1));
719     for (PetscMPIInt i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
722       PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
723     }
724     PetscCall(PetscFree(onodes1));
725     PetscCall(PetscFree(olengths1));
726 
727     /* Determine the number of messages to expect, their lengths, and from-ids */
728     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
729     PetscCall(PetscFree(rw1));
730   }
731   /* Now post the Irecvs corresponding to these messages */
732   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
733 
734   /* Now  post the sends */
735   PetscCall(PetscMalloc1(nrqr, &s_waits2));
736   for (PetscMPIInt i = 0; i < nrqr; ++i) PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i));
737 
738   /* receive work done on other processors */
739   {
740     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
741     PetscMPIInt idex;
742     PetscBT     table_i;
743 
744     for (PetscMPIInt i = 0; i < nrqs; ++i) {
745       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
746       /* Process the message */
747       rbuf2_i = rbuf2[idex];
748       ct1     = 2 * rbuf2_i[0] + 1;
749       jmax    = rbuf2[idex][0];
750       for (PetscInt j = 1; j <= jmax; j++) {
751         max     = rbuf2_i[2 * j];
752         is_no   = rbuf2_i[2 * j - 1];
753         isz_i   = isz[is_no];
754         table_i = table[is_no];
755 #if defined(PETSC_USE_CTABLE)
756         table_data_i = table_data[is_no];
757 #else
758         data_i = data[is_no];
759 #endif
760         for (k = 0; k < max; k++, ct1++) {
761           row = rbuf2_i[ct1];
762           if (!PetscBTLookupSet(table_i, row)) {
763 #if defined(PETSC_USE_CTABLE)
764             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
765 #else
766             data_i[isz_i] = row;
767 #endif
768             isz_i++;
769           }
770         }
771         isz[is_no] = isz_i;
772       }
773     }
774 
775     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
776   }
777 
778 #if defined(PETSC_USE_CTABLE)
779   tcount_max = 0;
780   for (PetscInt i = 0; i < imax; ++i) {
781     table_data_i = table_data[i];
782     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
783     if (tcount_max < tcount) tcount_max = tcount;
784   }
785   PetscCall(PetscMalloc1(tcount_max, &tdata));
786 #endif
787 
788   for (PetscInt i = 0; i < imax; ++i) {
789 #if defined(PETSC_USE_CTABLE)
790     PetscHashIter tpos;
791     PetscInt      j;
792 
793     table_data_i = table_data[i];
794     PetscHashIterBegin(table_data_i, tpos);
795     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
796       PetscHashIterGetKey(table_data_i, tpos, k);
797       PetscHashIterGetVal(table_data_i, tpos, j);
798       PetscHashIterNext(table_data_i, tpos);
799       tdata[--j] = --k;
800     }
801     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
802 #else
803     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
804 #endif
805     PetscCall(PetscCommDestroy(&iscomms[i]));
806   }
807 
808   PetscCall(PetscFree(iscomms));
809   PetscCall(PetscFree(onodes2));
810   PetscCall(PetscFree(olengths2));
811 
812   PetscCall(PetscFree(pa));
813   PetscCall(PetscFree(rbuf2[0]));
814   PetscCall(PetscFree(rbuf2));
815   PetscCall(PetscFree(s_waits1));
816   PetscCall(PetscFree(r_waits1));
817   PetscCall(PetscFree(s_waits2));
818   PetscCall(PetscFree(r_waits2));
819   PetscCall(PetscFree(recv_status));
820   if (xdata) {
821     PetscCall(PetscFree(xdata[0]));
822     PetscCall(PetscFree(xdata));
823   }
824   PetscCall(PetscFree(isz1));
825 #if defined(PETSC_USE_CTABLE)
826   for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
827   PetscCall(PetscFree(table_data));
828   PetscCall(PetscFree(tdata));
829   PetscCall(PetscFree4(table, data, isz, t_p));
830 #else
831   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
832 #endif
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
836 /*
837    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
838        the work on the local processor.
839 
840      Inputs:
841       C      - MAT_MPIAIJ;
842       imax - total no of index sets processed at a time;
843       table  - an array of char - size = m bits.
844 
845      Output:
846       isz    - array containing the count of the solution elements corresponding
847                to each index set;
848       data or table_data  - pointer to the solutions
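
      A note on the two storage modes (derived from the code): with PETSC_USE_CTABLE the accumulated
      indices of each IS live in the hash map table_data[i] (keyed by index+1, valued by position+1),
      otherwise they are stored densely in data[i]; in both cases the bitmap table[i] marks which
      global indices have already been added.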
849 */
850 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
851 {
852   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
853   Mat         A = c->A, B = c->B;
854   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
855   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
856   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
857   PetscBT     table_i;
858 #if defined(PETSC_USE_CTABLE)
859   PetscHMapI    table_data_i;
860   PetscHashIter tpos;
861   PetscInt      tcount, *tdata;
862 #else
863   PetscInt *data_i;
864 #endif
865 
866   PetscFunctionBegin;
867   rstart = C->rmap->rstart;
868   cstart = C->cmap->rstart;
869   ai     = a->i;
870   aj     = a->j;
871   bi     = b->i;
872   bj     = b->j;
873   garray = c->garray;
874 
875   for (i = 0; i < imax; i++) {
876 #if defined(PETSC_USE_CTABLE)
877     /* copy existing entries of table_data_i into tdata[] */
878     table_data_i = table_data[i];
879     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
880     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
881 
882     PetscCall(PetscMalloc1(tcount, &tdata));
883     PetscHashIterBegin(table_data_i, tpos);
884     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
885       PetscHashIterGetKey(table_data_i, tpos, row);
886       PetscHashIterGetVal(table_data_i, tpos, j);
887       PetscHashIterNext(table_data_i, tpos);
888       tdata[--j] = --row;
889       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
890     }
891 #else
892     data_i = data[i];
893 #endif
894     table_i = table[i];
895     isz_i   = isz[i];
896     max     = isz[i];
897 
898     for (j = 0; j < max; j++) {
899 #if defined(PETSC_USE_CTABLE)
900       row = tdata[j] - rstart;
901 #else
902       row = data_i[j] - rstart;
903 #endif
904       start = ai[row];
905       end   = ai[row + 1];
906       for (k = start; k < end; k++) { /* Amat */
907         val = aj[k] + cstart;
908         if (!PetscBTLookupSet(table_i, val)) {
909 #if defined(PETSC_USE_CTABLE)
910           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
911 #else
912           data_i[isz_i] = val;
913 #endif
914           isz_i++;
915         }
916       }
917       start = bi[row];
918       end   = bi[row + 1];
919       for (k = start; k < end; k++) { /* Bmat */
920         val = garray[bj[k]];
921         if (!PetscBTLookupSet(table_i, val)) {
922 #if defined(PETSC_USE_CTABLE)
923           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
924 #else
925           data_i[isz_i] = val;
926 #endif
927           isz_i++;
928         }
929       }
930     }
931     isz[i] = isz_i;
932 
933 #if defined(PETSC_USE_CTABLE)
934     PetscCall(PetscFree(tdata));
935 #endif
936   }
937   PetscFunctionReturn(PETSC_SUCCESS);
938 }
939 
940 /*
941       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
942          and return the output
943 
944          Input:
945            C    - the matrix
946            nrqr - no of messages being processed.
947            rbuf - an array of pointers to the received requests
948 
949          Output:
950            xdata - array of messages to be sent back
951            isz1  - size of each message
952 
953   For better efficiency perhaps we should malloc each xdata[i] separately;
954 then if a realloc is required we need only copy the data for that one row
955 rather than all previous rows, as happens now where a single large chunk of
956 memory is used.
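
  Layout of each reply xdata[i] (derived from the code below): it mirrors the request header, i.e.
  xdata[i][0] = number of index sets, followed by pairs <is index, number of indices found>, then the
  indices themselves (duplicate-free within each IS); isz1[i] is the total length of xdata[i].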
957 
958 */
959 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
960 {
961   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
962   Mat         A = c->A, B = c->B;
963   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
964   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
965   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
966   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
967   PetscInt   *rbuf_i, kmax, rbuf_0;
968   PetscBT     xtable;
969 
970   PetscFunctionBegin;
971   m      = C->rmap->N;
972   rstart = C->rmap->rstart;
973   cstart = C->cmap->rstart;
974   ai     = a->i;
975   aj     = a->j;
976   bi     = b->i;
977   bj     = b->j;
978   garray = c->garray;
979 
980   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
981     rbuf_i = rbuf[i];
982     rbuf_0 = rbuf_i[0];
983     ct += rbuf_0;
984     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
985   }
986 
987   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
988   else max1 = 1;
989   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
990   if (nrqr) {
991     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
992     ++no_malloc;
993   }
994   PetscCall(PetscBTCreate(m, &xtable));
995   PetscCall(PetscArrayzero(isz1, nrqr));
996 
997   ct3 = 0;
998   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
999     rbuf_i = rbuf[i];
1000     rbuf_0 = rbuf_i[0];
1001     ct1    = 2 * rbuf_0 + 1;
1002     ct2    = ct1;
1003     ct3 += ct1;
1004     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i */
1005       PetscCall(PetscBTMemzero(m, xtable));
1006       oct2 = ct2;
1007       kmax = rbuf_i[2 * j];
1008       for (k = 0; k < kmax; k++, ct1++) {
1009         row = rbuf_i[ct1];
1010         if (!PetscBTLookupSet(xtable, row)) {
1011           if (!(ct3 < mem_estimate)) {
1012             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1013             PetscCall(PetscMalloc1(new_estimate, &tmp));
1014             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1015             PetscCall(PetscFree(xdata[0]));
1016             xdata[0]     = tmp;
1017             mem_estimate = new_estimate;
1018             ++no_malloc;
1019             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1020           }
1021           xdata[i][ct2++] = row;
1022           ct3++;
1023         }
1024       }
1025       for (k = oct2, max2 = ct2; k < max2; k++) {
1026         row   = xdata[i][k] - rstart;
1027         start = ai[row];
1028         end   = ai[row + 1];
1029         for (l = start; l < end; l++) {
1030           val = aj[l] + cstart;
1031           if (!PetscBTLookupSet(xtable, val)) {
1032             if (!(ct3 < mem_estimate)) {
1033               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1034               PetscCall(PetscMalloc1(new_estimate, &tmp));
1035               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1036               PetscCall(PetscFree(xdata[0]));
1037               xdata[0]     = tmp;
1038               mem_estimate = new_estimate;
1039               ++no_malloc;
1040               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1041             }
1042             xdata[i][ct2++] = val;
1043             ct3++;
1044           }
1045         }
1046         start = bi[row];
1047         end   = bi[row + 1];
1048         for (l = start; l < end; l++) {
1049           val = garray[bj[l]];
1050           if (!PetscBTLookupSet(xtable, val)) {
1051             if (!(ct3 < mem_estimate)) {
1052               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1053               PetscCall(PetscMalloc1(new_estimate, &tmp));
1054               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1055               PetscCall(PetscFree(xdata[0]));
1056               xdata[0]     = tmp;
1057               mem_estimate = new_estimate;
1058               ++no_malloc;
1059               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1060             }
1061             xdata[i][ct2++] = val;
1062             ct3++;
1063           }
1064         }
1065       }
1066       /* Update the header*/
1067       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1068       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1069     }
1070     xdata[i][0] = rbuf_0;
1071     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1072     isz1[i] = ct2; /* size of each message */
1073   }
1074   PetscCall(PetscBTDestroy(&xtable));
1075   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1080 /*
1081     Every processor gets the entire matrix
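    (the result (*Bin)[0] is a sequential matrix holding the global nonzero structure; when
    flag == MAT_GET_VALUES the numerical values are gathered as well, otherwise only the structure
    is assembled)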
1082 */
1083 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1084 {
1085   Mat         B;
1086   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1087   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1088   PetscMPIInt size, rank;
1089   PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1090   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1091 
1092   PetscFunctionBegin;
1093   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1094   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1095   if (scall == MAT_INITIAL_MATRIX) {
1096     /* Tell every processor the number of nonzeros per row */
1097     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1098     for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1099 
1100     /* All MPI processes get the same matrix */
1101     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1102 
1103     /*     Create the sequential matrix of the same type as the local block diagonal  */
1104     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1105     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1106     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1107     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1108     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1109     PetscCall(PetscCalloc1(2, Bin));
1110     **Bin = B;
1111     b     = (Mat_SeqAIJ *)B->data;
1112 
1113     /* zero column space */
1114     nrecv = 0;
1115     for (PetscMPIInt i = 0; i < size; i++) {
1116       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1117     }
1118     PetscCall(PetscArrayzero(b->j, nrecv));
1119 
1120     /*   Copy my part of matrix column indices over    */
1121     sendcount  = ad->nz + bd->nz;
1122     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1123     a_jsendbuf = ad->j;
1124     b_jsendbuf = bd->j;
1125     n          = A->rmap->rend - A->rmap->rstart;
1126     cnt        = 0;
1127     for (PetscInt i = 0; i < n; i++) {
1128       /* put in lower diagonal portion */
1129       m = bd->i[i + 1] - bd->i[i];
1130       while (m > 0) {
1131         /* is it above diagonal (in bd (compressed) numbering) */
1132         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1133         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134         m--;
1135       }
1136 
1137       /* put in diagonal portion */
1138       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1139 
1140       /* put in upper diagonal portion */
1141       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1142     }
1143     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1144 
1145     /* send column indices, b->j was zeroed */
1146     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1147 
1148     /*  Assemble the matrix into usable form (numerical values not yet set) */
1149     /* set the b->ilen (length of each row) values */
1150     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1151     /* set the b->i indices */
1152     b->i[0] = 0;
1153     for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1154     PetscCall(PetscFree(lens));
1155     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1156     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1157 
1158   } else {
1159     B = **Bin;
1160     b = (Mat_SeqAIJ *)B->data;
1161   }
1162 
1163   /* Copy my part of matrix numerical values into the values location */
1164   if (flag == MAT_GET_VALUES) {
1165     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166     MatScalar         *sendbuf;
1167 
1168     /* initialize b->a */
1169     PetscCall(PetscArrayzero(b->a, b->nz));
1170 
1171     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173     sendcount = ad->nz + bd->nz;
1174     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1175     a_sendbuf = ada;
1176     b_sendbuf = bda;
1177     b_sendj   = bd->j;
1178     n         = A->rmap->rend - A->rmap->rstart;
1179     cnt       = 0;
1180     for (PetscInt i = 0; i < n; i++) {
1181       /* put in lower diagonal portion */
1182       m = bd->i[i + 1] - bd->i[i];
1183       while (m > 0) {
1184         /* is it above diagonal (in bd (compressed) numbering) */
1185         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         m--;
1188         b_sendj++;
1189       }
1190 
1191       /* put in diagonal portion */
1192       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1193 
1194       /* put in upper diagonal portion */
1195       while (m-- > 0) {
1196         sendbuf[cnt++] = *b_sendbuf++;
1197         b_sendj++;
1198       }
1199     }
1200     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1201 
1202     /* send values, b->a was zeroed */
1203     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1204 
1205     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1206     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1207   } /* end of (flag == MAT_GET_VALUES) */
1208 
1209   PetscCall(MatPropagateSymmetryOptions(A, B));
1210   PetscFunctionReturn(PETSC_SUCCESS);
1211 }
1212 
1213 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1214 {
1215   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1216   Mat             submat, A = c->A, B = c->B;
1217   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1218   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1219   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1220   const PetscInt *icol, *irow;
1221   PetscInt        nrow, ncol, start;
1222   PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
1223   PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
1224   PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
1225   PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
1226   PetscInt       *lens, rmax, ncols, *cols, Crow;
1227 #if defined(PETSC_USE_CTABLE)
1228   PetscHMapI cmap, rmap;
1229   PetscInt  *cmap_loc, *rmap_loc;
1230 #else
1231   PetscInt *cmap, *rmap;
1232 #endif
1233   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
1234   PetscInt     *cworkB, lwrite, *subcols, ib, jb;
1235   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1236   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1237   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1238   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1239   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1240   MPI_Comm      comm;
1241   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1242   PetscMPIInt  *onodes1, *olengths1, idex, end, *row2proc;
1243   Mat_SubSppt  *smatis1;
1244   PetscBool     isrowsorted, iscolsorted;
1245 
1246   PetscFunctionBegin;
1247   PetscValidLogicalCollectiveInt(C, ismax, 2);
1248   PetscValidLogicalCollectiveEnum(C, scall, 5);
1249   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1250   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1251   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1252   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1253   size = c->size;
1254   rank = c->rank;
1255 
1256   PetscCall(ISSorted(iscol[0], &iscolsorted));
1257   PetscCall(ISSorted(isrow[0], &isrowsorted));
1258   PetscCall(ISGetIndices(isrow[0], &irow));
1259   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1260   if (allcolumns) {
1261     icol = NULL;
1262     ncol = C->cmap->N;
1263   } else {
1264     PetscCall(ISGetIndices(iscol[0], &icol));
1265     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1266   }
1267 
1268   if (scall == MAT_INITIAL_MATRIX) {
1269     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1270 
1271     /* Get some new tags to keep the communication clean */
1272     tag1 = ((PetscObject)C)->tag;
1273     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1274     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1275 
1276     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1277      required. Based on this, buffers are allocated, and data copied into them */
1278     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1279     PetscCall(PetscMalloc1(nrow, &row2proc));
1280 
1281     /* w1[proc] = num of rows owned by proc -- to be requested */
1282     proc = 0;
1283     nrqs = 0; /* num of outgoing messages */
1284     for (PetscInt j = 0; j < nrow; j++) {
1285       row = irow[j];
1286       if (!isrowsorted) proc = 0;
1287       while (row >= C->rmap->range[proc + 1]) proc++;
1288       w1[proc]++;
1289       row2proc[j] = proc; /* map row index to proc */
1290 
1291       if (proc != rank && !w2[proc]) {
1292         w2[proc] = 1;
1293         nrqs++;
1294       }
1295     }
1296     w1[rank] = 0; /* rows owned by self will not be requested */
1297 
1298     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1299     for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) {
1300       if (w1[proc]) pa[j++] = proc;
1301     }
1302 
1303     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1304     msz = 0; /* total mesg length (for all procs) */
1305     for (PetscMPIInt i = 0; i < nrqs; i++) {
1306       w1[pa[i]] += 3;
1307       msz += w1[pa[i]];
1308     }
1309     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
1310 
1311     /* Determine nrqr, the number of messages to expect, their lengths, and from-ids */
1312     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1313     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1314 
1315     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1316        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1317     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1318 
1319     /* Now post the Irecvs corresponding to these messages */
1320     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1321 
1322     PetscCall(PetscFree(onodes1));
1323     PetscCall(PetscFree(olengths1));
1324 
1325     /* Allocate Memory for outgoing messages */
1326     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1327     PetscCall(PetscArrayzero(sbuf1, size));
1328     PetscCall(PetscArrayzero(ptr, size));
1329 
1330     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1331     iptr = tmp;
1332     for (PetscMPIInt i = 0; i < nrqs; i++) {
1333       sbuf1[pa[i]] = iptr;
1334       iptr += w1[pa[i]];
1335     }
1336 
1337     /* Form the outgoing messages */
1338     /* Initialize the header space */
1339     for (PetscMPIInt i = 0; i < nrqs; i++) {
1340       PetscCall(PetscArrayzero(sbuf1[pa[i]], 3));
1341       ptr[pa[i]] = sbuf1[pa[i]] + 3;
1342     }
1343 
1344     /* Parse the isrow and copy data into outbuf */
1345     PetscCall(PetscArrayzero(ctr, size));
1346     for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */
1347       if (row2proc[j] != rank) {          /* copy to the outgoing buf */
1348         *ptr[row2proc[j]] = irow[j];
1349         ctr[row2proc[j]]++;
1350         ptr[row2proc[j]]++;
1351       }
1352     }
1353 
1354     /* Update the headers for the current IS */
1355     for (PetscMPIInt j = 0; j < size; j++) { /* Could optimize this loop too */
1356       if ((ctr_j = ctr[j])) {
1357         sbuf1_j            = sbuf1[j];
1358         k                  = ++sbuf1_j[0];
1359         sbuf1_j[2 * k]     = ctr_j;
1360         sbuf1_j[2 * k - 1] = 0;
1361       }
1362     }
1363 
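    /* Layout sketch (illustration only): each outgoing request sbuf1[j] now reads
       [nIS | is_no, nrows | requested global row indices]; e.g. a single IS requesting
       rows 5 and 9 from proc j would yield sbuf1[j] = {1, 0, 2, 5, 9}. */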
1364     /* Now post the sends */
1365     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1366     for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Isend(sbuf1[pa[i]], w1[pa[i]], MPIU_INT, pa[i], tag1, comm, s_waits1 + i));
1367 
1368     /* Post Receives to capture the buffer size */
1369     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1370     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1371 
1372     if (nrqs) rbuf2[0] = tmp + msz;
1373     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1374 
1375     for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[pa[i]], MPIU_INT, pa[i], tag2, comm, r_waits2 + i));
1376 
1377     PetscCall(PetscFree2(w1, w2));
1378 
1379     /* Send to other procs the buf size they should allocate */
1380     /* Receive messages*/
1381     PetscCall(PetscMalloc1(nrqr, &r_status1));
1382     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1383 
1384     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1385     for (PetscMPIInt i = 0; i < nrqr; ++i) {
1386       req_size[i] = 0;
1387       rbuf1_i     = rbuf1[i];
1388       start       = 2 * rbuf1_i[0] + 1;
1389       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1390       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1391       sbuf2_i = sbuf2[i];
1392       for (PetscInt j = start; j < end; j++) {
1393         k          = rbuf1_i[j] - rstart;
1394         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1395         sbuf2_i[j] = ncols;
1396         req_size[i] += ncols;
1397       }
1398       req_source1[i] = r_status1[i].MPI_SOURCE;
1399 
1400       /* form the header */
1401       sbuf2_i[0] = req_size[i];
1402       for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1403 
1404       PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1405     }
1406 
1407     PetscCall(PetscFree(r_status1));
1408     PetscCall(PetscFree(r_waits1));
1409 
1410     /* rbuf2 is received; post receives for column indices a->j */
1411     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1412 
1413     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1414     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1415       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1416       req_source2[i] = r_status2[i].MPI_SOURCE;
1417       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1418     }
1419 
1420     /* Wait on sends1 and sends2 */
1421     PetscCall(PetscMalloc1(nrqs, &s_status1));
1422     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1423     PetscCall(PetscFree(s_waits1));
1424     PetscCall(PetscFree(s_status1));
1425 
1426     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1427     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1428 
1429     /* Now allocate sending buffers for a->j, and send them off */
1430     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1431     jcnt = 0;
1432     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1433     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
1434     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1435 
1436     for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */
1437       rbuf1_i   = rbuf1[i];
1438       sbuf_aj_i = sbuf_aj[i];
1439       ct1       = 2 * rbuf1_i[0] + 1;
1440       ct2       = 0;
1441       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1442 
1443       kmax = rbuf1[i][2];
1444       for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */
1445         row    = rbuf1_i[ct1] - rstart;
1446         nzA    = ai[row + 1] - ai[row];
1447         nzB    = bi[row + 1] - bi[row];
1448         ncols  = nzA + nzB;
1449         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1450         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1451 
1452         /* load the column indices for this row into cols*/
1453         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1454 
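        /* The three passes below emit this row's column indices in ascending global order:
           B-part columns mapping below the diagonal block, then A-part columns shifted by
           cstart, then B-part columns at or beyond cend. This assumes c->garray (bmap) is
           kept sorted, as it is for MPIAIJ. */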
1455         lwrite = 0;
1456         for (PetscInt l = 0; l < nzB; l++) {
1457           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1458         }
1459         for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1460         for (PetscInt l = 0; l < nzB; l++) {
1461           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1462         }
1463 
1464         ct2 += ncols;
1465       }
1466       PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1467     }
1468 
1469     /* create column map (cmap): global col of C -> local col of submat */
1470 #if defined(PETSC_USE_CTABLE)
1471     if (!allcolumns) {
1472       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1473       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1474       for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1475         if (icol[j] >= cstart && icol[j] < cend) {
1476           cmap_loc[icol[j] - cstart] = j + 1;
1477         } else { /* use PetscHMapI for non-local col indices */
1478           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1479         }
1480       }
1481     } else {
1482       cmap     = NULL;
1483       cmap_loc = NULL;
1484     }
1485     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1486 #else
1487     if (!allcolumns) {
1488       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1489       for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1490     } else {
1491       cmap = NULL;
1492     }
1493 #endif
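    /* Note: cmap (and cmap_loc) store the submat column index shifted by +1, so that a
       lookup returning 0 means "global column not selected by iscol"; the stored value is
       decremented before being used as a column index. */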
1494 
1495     /* Create lens for MatSeqAIJSetPreallocation() */
1496     PetscCall(PetscCalloc1(nrow, &lens));
1497 
1498     /* Compute lens from local part of C */
1499     for (PetscInt j = 0; j < nrow; j++) {
1500       row = irow[j];
1501       if (row2proc[j] == rank) {
1502         /* diagonal part A = c->A */
1503         ncols = ai[row - rstart + 1] - ai[row - rstart];
1504         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1505         if (!allcolumns) {
1506           for (PetscInt k = 0; k < ncols; k++) {
1507 #if defined(PETSC_USE_CTABLE)
1508             tcol = cmap_loc[cols[k]];
1509 #else
1510             tcol = cmap[cols[k] + cstart];
1511 #endif
1512             if (tcol) lens[j]++;
1513           }
1514         } else { /* allcolumns */
1515           lens[j] = ncols;
1516         }
1517 
1518         /* off-diagonal part B = c->B */
1519         ncols = bi[row - rstart + 1] - bi[row - rstart];
1520         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1521         if (!allcolumns) {
1522           for (PetscInt k = 0; k < ncols; k++) {
1523 #if defined(PETSC_USE_CTABLE)
1524             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1525 #else
1526             tcol = cmap[bmap[cols[k]]];
1527 #endif
1528             if (tcol) lens[j]++;
1529           }
1530         } else { /* allcolumns */
1531           lens[j] += ncols;
1532         }
1533       }
1534     }
1535 
1536     /* Create row map (rmap): global row of C -> local row of submat */
1537 #if defined(PETSC_USE_CTABLE)
1538     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1539     for (PetscInt j = 0; j < nrow; j++) {
1540       row = irow[j];
1541       if (row2proc[j] == rank) { /* a local row */
1542         rmap_loc[row - rstart] = j;
1543       } else {
1544         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1545       }
1546     }
1547 #else
1548     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1549     for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j;
1550 #endif
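    /* Note: rmap_loc maps a local row of C directly to its submat row index, while the
       hash map rmap stores off-process rows shifted by +1 so that a lookup of 0 can flag
       "row not found" (checked below); the plain-array variant stores the index directly. */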
1551 
1552     /* Update lens from offproc data */
1553     /* recv a->j is done */
1554     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1555     for (PetscMPIInt i = 0; i < nrqs; i++) {
1556       sbuf1_i = sbuf1[pa[i]];
1557       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1558       ct1     = 2 + 1;
1559       ct2     = 0;
1560       rbuf2_i = rbuf2[i]; /* received length of C->j */
1561       rbuf3_i = rbuf3[i]; /* received C->j */
1562 
1563       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1564       max1 = sbuf1_i[2];
1565       for (PetscInt k = 0; k < max1; k++, ct1++) {
1566 #if defined(PETSC_USE_CTABLE)
1567         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1568         row--;
1569         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1570 #else
1571         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1572 #endif
1573         /* Now, store row index of submat in sbuf1_i[ct1] */
1574         sbuf1_i[ct1] = row;
1575 
1576         nnz = rbuf2_i[ct1];
1577         if (!allcolumns) {
1578           for (PetscInt l = 0; l < nnz; l++, ct2++) {
1579 #if defined(PETSC_USE_CTABLE)
1580             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1581               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1582             } else {
1583               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1584             }
1585 #else
1586             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1587 #endif
1588             if (tcol) lens[row]++;
1589           }
1590         } else { /* allcolumns */
1591           lens[row] += nnz;
1592         }
1593       }
1594     }
1595     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1596     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1597 
1598     /* Create the submatrices */
1599     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1600     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1601 
1602     PetscCall(ISGetBlockSize(isrow[0], &ib));
1603     PetscCall(ISGetBlockSize(iscol[0], &jb));
1604     if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb));
1605     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1606     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1607 
1608     /* create struct Mat_SubSppt and attach it to submat */
1609     PetscCall(PetscNew(&smatis1));
1610     subc            = (Mat_SeqAIJ *)submat->data;
1611     subc->submatis1 = smatis1;
1612 
1613     smatis1->id          = 0;
1614     smatis1->nrqs        = nrqs;
1615     smatis1->nrqr        = nrqr;
1616     smatis1->rbuf1       = rbuf1;
1617     smatis1->rbuf2       = rbuf2;
1618     smatis1->rbuf3       = rbuf3;
1619     smatis1->sbuf2       = sbuf2;
1620     smatis1->req_source2 = req_source2;
1621 
1622     smatis1->sbuf1 = sbuf1;
1623     smatis1->ptr   = ptr;
1624     smatis1->tmp   = tmp;
1625     smatis1->ctr   = ctr;
1626 
1627     smatis1->pa          = pa;
1628     smatis1->req_size    = req_size;
1629     smatis1->req_source1 = req_source1;
1630 
1631     smatis1->allcolumns = allcolumns;
1632     smatis1->singleis   = PETSC_TRUE;
1633     smatis1->row2proc   = row2proc;
1634     smatis1->rmap       = rmap;
1635     smatis1->cmap       = cmap;
1636 #if defined(PETSC_USE_CTABLE)
1637     smatis1->rmap_loc = rmap_loc;
1638     smatis1->cmap_loc = cmap_loc;
1639 #endif
1640 
1641     smatis1->destroy     = submat->ops->destroy;
1642     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1643     submat->factortype   = C->factortype;
1644 
1645     /* compute rmax */
1646     rmax = 0;
1647     for (PetscInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1648 
1649   } else { /* scall == MAT_REUSE_MATRIX */
1650     submat = submats[0];
1651     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1652 
1653     subc        = (Mat_SeqAIJ *)submat->data;
1654     rmax        = subc->rmax;
1655     smatis1     = subc->submatis1;
1656     nrqs        = smatis1->nrqs;
1657     nrqr        = smatis1->nrqr;
1658     rbuf1       = smatis1->rbuf1;
1659     rbuf2       = smatis1->rbuf2;
1660     rbuf3       = smatis1->rbuf3;
1661     req_source2 = smatis1->req_source2;
1662 
1663     sbuf1 = smatis1->sbuf1;
1664     sbuf2 = smatis1->sbuf2;
1665     ptr   = smatis1->ptr;
1666     tmp   = smatis1->tmp;
1667     ctr   = smatis1->ctr;
1668 
1669     pa          = smatis1->pa;
1670     req_size    = smatis1->req_size;
1671     req_source1 = smatis1->req_source1;
1672 
1673     allcolumns = smatis1->allcolumns;
1674     row2proc   = smatis1->row2proc;
1675     rmap       = smatis1->rmap;
1676     cmap       = smatis1->cmap;
1677 #if defined(PETSC_USE_CTABLE)
1678     rmap_loc = smatis1->rmap_loc;
1679     cmap_loc = smatis1->cmap_loc;
1680 #endif
1681   }
1682 
1683   /* Post recv matrix values */
1684   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1685   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1686   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1687   for (PetscMPIInt i = 0; i < nrqs; ++i) {
1688     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1689     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1690   }
1691 
1692   /* Allocate sending buffers for a->a, and send them off */
1693   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1694   jcnt = 0;
1695   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1696   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
1697   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1698 
1699   for (PetscMPIInt i = 0; i < nrqr; i++) {
1700     rbuf1_i   = rbuf1[i];
1701     sbuf_aa_i = sbuf_aa[i];
1702     ct1       = 2 * rbuf1_i[0] + 1;
1703     ct2       = 0;
1704     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1705 
1706     kmax = rbuf1_i[2];
1707     for (PetscInt k = 0; k < kmax; k++, ct1++) {
1708       row    = rbuf1_i[ct1] - rstart;
1709       nzA    = ai[row + 1] - ai[row];
1710       nzB    = bi[row + 1] - bi[row];
1711       ncols  = nzA + nzB;
1712       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1713       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1714       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1715 
1716       /* load the column values for this row into vals*/
1717       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1718 
1719       lwrite = 0;
1720       for (PetscInt l = 0; l < nzB; l++) {
1721         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1722       }
1723       for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1724       for (PetscInt l = 0; l < nzB; l++) {
1725         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1726       }
1727 
1728       ct2 += ncols;
1729     }
1730     PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1731   }
1732 
1733   /* Assemble submat */
1734   /* First assemble the local rows */
1735   for (PetscInt j = 0; j < nrow; j++) {
1736     row = irow[j];
1737     if (row2proc[j] == rank) {
1738       Crow = row - rstart; /* local row index of C */
1739 #if defined(PETSC_USE_CTABLE)
1740       row = rmap_loc[Crow]; /* row index of submat */
1741 #else
1742       row = rmap[row];
1743 #endif
1744 
1745       if (allcolumns) {
1746         PetscInt ncol = 0;
1747 
1748         /* diagonal part A = c->A */
1749         ncols = ai[Crow + 1] - ai[Crow];
1750         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1751         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1752         for (PetscInt k = 0; k < ncols; k++) {
1753           subcols[ncol]   = cols[k] + cstart;
1754           subvals[ncol++] = vals[k];
1755         }
1756 
1757         /* off-diagonal part B = c->B */
1758         ncols = bi[Crow + 1] - bi[Crow];
1759         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1760         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1761         for (PetscInt k = 0; k < ncols; k++) {
1762           subcols[ncol]   = bmap[cols[k]];
1763           subvals[ncol++] = vals[k];
1764         }
1765 
1766         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1767 
1768       } else { /* !allcolumns */
1769         PetscInt ncol = 0;
1770 
1771 #if defined(PETSC_USE_CTABLE)
1772         /* diagonal part A = c->A */
1773         ncols = ai[Crow + 1] - ai[Crow];
1774         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1775         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1776         for (PetscInt k = 0; k < ncols; k++) {
1777           tcol = cmap_loc[cols[k]];
1778           if (tcol) {
1779             subcols[ncol]   = --tcol;
1780             subvals[ncol++] = vals[k];
1781           }
1782         }
1783 
1784         /* off-diagonal part B = c->B */
1785         ncols = bi[Crow + 1] - bi[Crow];
1786         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1787         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1788         for (PetscInt k = 0; k < ncols; k++) {
1789           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1790           if (tcol) {
1791             subcols[ncol]   = --tcol;
1792             subvals[ncol++] = vals[k];
1793           }
1794         }
1795 #else
1796         /* diagonal part A = c->A */
1797         ncols = ai[Crow + 1] - ai[Crow];
1798         cols  = aj + ai[Crow];
1799         vals  = a_a + ai[Crow];
1800         for (PetscInt k = 0; k < ncols; k++) {
1801           tcol = cmap[cols[k] + cstart];
1802           if (tcol) {
1803             subcols[ncol]   = --tcol;
1804             subvals[ncol++] = vals[k];
1805           }
1806         }
1807 
1808         /* off-diagonal part B = c->B */
1809         ncols = bi[Crow + 1] - bi[Crow];
1810         cols  = bj + bi[Crow];
1811         vals  = b_a + bi[Crow];
1812         for (PetscInt k = 0; k < ncols; k++) {
1813           tcol = cmap[bmap[cols[k]]];
1814           if (tcol) {
1815             subcols[ncol]   = --tcol;
1816             subvals[ncol++] = vals[k];
1817           }
1818         }
1819 #endif
1820         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1821       }
1822     }
1823   }
1824 
1825   /* Now assemble the off-proc rows */
1826   for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */
1827     /* recv values from other processes */
1828     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1829     sbuf1_i = sbuf1[pa[idex]];
1830     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1831     ct1     = 2 + 1;
1832     ct2     = 0;           /* count of received C->j */
1833     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1834     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1835     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1836     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1837 
1838     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1839     max1 = sbuf1_i[2];                           /* num of rows */
1840     for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each received row */
1841       row = sbuf1_i[ct1];                        /* row index of submat */
1842       if (!allcolumns) {
1843         idex = 0;
1844         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1845           nnz = rbuf2_i[ct1];                         /* num of C entries in this row */
1846           for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each received column */
1847 #if defined(PETSC_USE_CTABLE)
1848             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1849               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1850             } else {
1851               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1852             }
1853 #else
1854             tcol = cmap[rbuf3_i[ct2]];
1855 #endif
1856             if (tcol) {
1857               subcols[idex]   = --tcol; /* may not be sorted */
1858               subvals[idex++] = rbuf4_i[ct2];
1859 
1860               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1861                For reuse, we overwrite the received C->j with the buffer positions of the entries to be inserted into submat */
1862               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1863             }
1864           }
1865           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1866         } else { /* scall == MAT_REUSE_MATRIX */
1867           submat = submats[0];
1868           subc   = (Mat_SeqAIJ *)submat->data;
1869 
1870           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1871           for (PetscInt l = 0; l < nnz; l++) {
1872             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1873             subvals[idex++] = rbuf4_i[ct2];
1874           }
1875 
1876           bj = subc->j + subc->i[row]; /* sorted column indices */
1877           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1878         }
1879       } else {              /* allcolumns */
1880         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1881         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1882         ct2 += nnz;
1883       }
1884     }
1885   }
1886 
1887   /* sends of a->a are done */
1888   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1889   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1890 
1891   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1892   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1893   submats[0] = submat;
1894 
1895   /* Restore the indices */
1896   PetscCall(ISRestoreIndices(isrow[0], &irow));
1897   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1898 
1899   /* Destroy allocated memory */
1900   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1901   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1902   if (sbuf_aa) {
1903     PetscCall(PetscFree(sbuf_aa[0]));
1904     PetscCall(PetscFree(sbuf_aa));
1905   }
1906 
1907   if (scall == MAT_INITIAL_MATRIX) {
1908     PetscCall(PetscFree(lens));
1909     if (sbuf_aj) {
1910       PetscCall(PetscFree(sbuf_aj[0]));
1911       PetscCall(PetscFree(sbuf_aj));
1912     }
1913   }
1914   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1915   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1916   PetscFunctionReturn(PETSC_SUCCESS);
1917 }
1918 
1919 static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1920 {
1921   PetscInt  ncol;
1922   PetscBool colflag, allcolumns = PETSC_FALSE;
1923 
1924   PetscFunctionBegin;
1925   /* Allocate memory to hold all the submatrices */
1926   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1927 
1928   /* Check for special case: each processor gets entire matrix columns */
1929   PetscCall(ISIdentity(iscol[0], &colflag));
1930   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1931   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1932 
1933   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1934   PetscFunctionReturn(PETSC_SUCCESS);
1935 }
1936 
1937 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1938 {
1939   PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
1940   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1941   Mat_SeqAIJ  *subc;
1942   Mat_SubSppt *smat;
1943 
1944   PetscFunctionBegin;
1945   /* Check for special case: each processor has a single IS */
1946   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1947     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1948     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1949     PetscFunctionReturn(PETSC_SUCCESS);
1950   }
1951 
1952   /* Determine nmax, the maximum number of submatrices to extract per stage (based on the column dimension) */
1953   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1954   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1955   if (!nmax) nmax = 1;
1956 
1957   if (scall == MAT_INITIAL_MATRIX) {
1958     /* Collect global wantallmatrix and nstages */
1959     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1960       PetscCall(ISIdentity(*isrow, &rowflag));
1961       PetscCall(ISIdentity(*iscol, &colflag));
1962       PetscCall(ISGetLocalSize(*isrow, &nrow));
1963       PetscCall(ISGetLocalSize(*iscol, &ncol));
1964       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1965         wantallmatrix = PETSC_TRUE;
1966 
1967         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1968       }
1969     }
1970 
1971     /* Determine the number of stages through which submatrices are done
1972        Each stage will extract nmax submatrices.
1973        nmax is determined by the matrix column dimension.
1974        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1975     */
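    /* Worked example (illustration only, assuming 8-byte PetscInt): with C->cmap->N = 1e6,
       nmax = 20e6 / (1e6 * 8) = 2, so at most two submatrices are extracted per stage. */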
1976     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1977 
1978     in[0] = -1 * (PetscInt)wantallmatrix;
1979     in[1] = nstages;
1980     PetscCallMPI(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1981     wantallmatrix = (PetscBool)(-out[0]);
1982     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
1983 
1984   } else { /* MAT_REUSE_MATRIX */
1985     if (ismax) {
1986       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1987       smat = subc->submatis1;
1988     } else { /* (*submat)[0] is a dummy matrix */
1989       smat = (Mat_SubSppt *)(*submat)[0]->data;
1990     }
1991     if (!smat) {
1992       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1993       wantallmatrix = PETSC_TRUE;
1994     } else if (smat->singleis) {
1995       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1996       PetscFunctionReturn(PETSC_SUCCESS);
1997     } else {
1998       nstages = smat->nstages;
1999     }
2000   }
2001 
2002   if (wantallmatrix) {
2003     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2004     PetscFunctionReturn(PETSC_SUCCESS);
2005   }
2006 
2007   /* Allocate memory to hold all the submatrices and dummy submatrices */
2008   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2009 
2010   for (PetscInt i = 0, pos = 0; i < nstages; i++) {
2011     if (pos + nmax <= ismax) max_no = nmax;
2012     else if (pos >= ismax) max_no = 0;
2013     else max_no = ismax - pos;
2014 
2015     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2016     if (!max_no) {
2017       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2018         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2019         smat->nstages = nstages;
2020       }
2021       pos++; /* advance to next dummy matrix if any */
2022     } else pos += max_no;
2023   }
2024 
2025   if (ismax && scall == MAT_INITIAL_MATRIX) {
2026     /* save nstages for reuse */
2027     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2028     smat          = subc->submatis1;
2029     smat->nstages = nstages;
2030   }
2031   PetscFunctionReturn(PETSC_SUCCESS);
2032 }
2033 
2034 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2035 {
2036   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2037   Mat              A = c->A;
2038   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2039   const PetscInt **icol, **irow;
2040   PetscInt        *nrow, *ncol, start;
2041   PetscMPIInt      nrqs = 0, *pa, proc = -1;
2042   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
2043   PetscInt       **sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row;
2044   PetscInt         msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol;
2045   PetscInt       **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2046   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2047 #if defined(PETSC_USE_CTABLE)
2048   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2049 #else
2050   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2051 #endif
2052   const PetscInt *irow_i;
2053   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2054   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2055   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2056   MPI_Comm        comm;
2057   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2058   PetscMPIInt    *onodes1, *olengths1, end, **row2proc, *row2proc_i;
2059   PetscInt        ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2060   Mat_SubSppt    *smat_i;
2061   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2062   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt;
2063 
2064   PetscFunctionBegin;
2065   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2066   size = c->size;
2067   rank = c->rank;
2068 
2069   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2070   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2071 
2072   for (PetscInt i = 0; i < ismax; i++) {
2073     PetscCall(ISSorted(iscol[i], &issorted[i]));
2074     if (!issorted[i]) iscsorted = issorted[i];
2075 
2076     PetscCall(ISSorted(isrow[i], &issorted[i]));
2077 
2078     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2079     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2080 
2081     /* Check for special case: allcolumn */
2082     PetscCall(ISIdentity(iscol[i], &colflag));
2083     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2084     if (colflag && ncol[i] == C->cmap->N) {
2085       allcolumns[i] = PETSC_TRUE;
2086       icol[i]       = NULL;
2087     } else {
2088       allcolumns[i] = PETSC_FALSE;
2089       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2090     }
2091   }
2092 
2093   if (scall == MAT_REUSE_MATRIX) {
2094     /* Assumes new rows are same length as the old rows */
2095     for (PetscInt i = 0; i < ismax; i++) {
2096       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2097       subc = (Mat_SeqAIJ *)submats[i]->data;
2098       PetscCheck(submats[i]->rmap->n == nrow[i] && submats[i]->cmap->n == ncol[i], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2099 
2100       /* Initialize the matrix as if empty */
2101       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2102 
2103       smat_i = subc->submatis1;
2104 
2105       nrqs        = smat_i->nrqs;
2106       nrqr        = smat_i->nrqr;
2107       rbuf1       = smat_i->rbuf1;
2108       rbuf2       = smat_i->rbuf2;
2109       rbuf3       = smat_i->rbuf3;
2110       req_source2 = smat_i->req_source2;
2111 
2112       sbuf1 = smat_i->sbuf1;
2113       sbuf2 = smat_i->sbuf2;
2114       ptr   = smat_i->ptr;
2115       tmp   = smat_i->tmp;
2116       ctr   = smat_i->ctr;
2117 
2118       pa          = smat_i->pa;
2119       req_size    = smat_i->req_size;
2120       req_source1 = smat_i->req_source1;
2121 
2122       allcolumns[i] = smat_i->allcolumns;
2123       row2proc[i]   = smat_i->row2proc;
2124       rmap[i]       = smat_i->rmap;
2125       cmap[i]       = smat_i->cmap;
2126     }
2127 
2128     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2129       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2130       smat_i = (Mat_SubSppt *)submats[0]->data;
2131 
2132       nrqs        = smat_i->nrqs;
2133       nrqr        = smat_i->nrqr;
2134       rbuf1       = smat_i->rbuf1;
2135       rbuf2       = smat_i->rbuf2;
2136       rbuf3       = smat_i->rbuf3;
2137       req_source2 = smat_i->req_source2;
2138 
2139       sbuf1 = smat_i->sbuf1;
2140       sbuf2 = smat_i->sbuf2;
2141       ptr   = smat_i->ptr;
2142       tmp   = smat_i->tmp;
2143       ctr   = smat_i->ctr;
2144 
2145       pa          = smat_i->pa;
2146       req_size    = smat_i->req_size;
2147       req_source1 = smat_i->req_source1;
2148 
2149       allcolumns[0] = PETSC_FALSE;
2150     }
2151   } else { /* scall == MAT_INITIAL_MATRIX */
2152     /* Get some new tags to keep the communication clean */
2153     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2154     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2155 
2156     /* evaluate communication - mesg to who, length of mesg, and buffer space
2157      required. Based on this, buffers are allocated, and data copied into them*/
2158     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2159 
2160     for (PetscInt i = 0; i < ismax; i++) {
2161       jmax   = nrow[i];
2162       irow_i = irow[i];
2163 
2164       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2165       row2proc[i] = row2proc_i;
2166 
2167       if (issorted[i]) proc = 0;
2168       for (PetscInt j = 0; j < jmax; j++) {
2169         if (!issorted[i]) proc = 0;
2170         row = irow_i[j];
2171         while (row >= C->rmap->range[proc + 1]) proc++;
2172         w4[proc]++;
2173         row2proc_i[j] = proc; /* map row index to proc */
2174       }
2175       for (PetscMPIInt j = 0; j < size; j++) {
2176         if (w4[j]) {
2177           w1[j] += w4[j];
2178           w3[j]++;
2179           w4[j] = 0;
2180         }
2181       }
2182     }
2183 
2184     nrqs     = 0; /* no of outgoing messages */
2185     msz      = 0; /* total mesg length (for all procs) */
2186     w1[rank] = 0; /* no mesg sent to self */
2187     w3[rank] = 0;
2188     for (PetscMPIInt i = 0; i < size; i++) {
2189       if (w1[i]) {
2190         w2[i] = 1;
2191         nrqs++;
2192       } /* there exists a message to proc i */
2193     }
2194     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2195     for (PetscMPIInt i = 0, j = 0; i < size; i++) {
2196       if (w1[i]) {
2197         pa[j] = i;
2198         j++;
2199       }
2200     }
2201 
2202     /* Each message would have a header = 1 + 2*(no of IS) + data */
2203     for (PetscMPIInt i = 0; i < nrqs; i++) {
2204       PetscMPIInt j = pa[i];
2205       w1[j] += w2[j] + 2 * w3[j];
2206       msz += w1[j];
2207     }
2208     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
2209 
2210     /* Determine the number of messages to expect, their lengths, and their from-ids */
2211     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2212     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2213 
2214     /* Now post the Irecvs corresponding to these messages */
2215     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2216     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2217 
2218     /* Allocate Memory for outgoing messages */
2219     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2220     PetscCall(PetscArrayzero(sbuf1, size));
2221     PetscCall(PetscArrayzero(ptr, size));
2222 
2223     {
2224       PetscInt *iptr = tmp;
2225       k              = 0;
2226       for (PetscMPIInt i = 0; i < nrqs; i++) {
2227         PetscMPIInt j = pa[i];
2228         iptr += k;
2229         sbuf1[j] = iptr;
2230         k        = w1[j];
2231       }
2232     }
2233 
2234     /* Form the outgoing messages. Initialize the header space */
2235     for (PetscMPIInt i = 0; i < nrqs; i++) {
2236       PetscMPIInt j = pa[i];
2237       sbuf1[j][0]   = 0;
2238       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2239       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2240     }
2241 
2242     /* Parse the isrow and copy data into outbuf */
2243     for (PetscInt i = 0; i < ismax; i++) {
2244       row2proc_i = row2proc[i];
2245       PetscCall(PetscArrayzero(ctr, size));
2246       irow_i = irow[i];
2247       jmax   = nrow[i];
2248       for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */
2249         proc = row2proc_i[j];
2250         if (proc != rank) { /* copy to the outgoing buf */
2251           ctr[proc]++;
2252           *ptr[proc] = irow_i[j];
2253           ptr[proc]++;
2254         }
2255       }
2256       /* Update the headers for the current IS */
2257       for (PetscMPIInt j = 0; j < size; j++) { /* Could optimize this loop too */
2258         if ((ctr_j = ctr[j])) {
2259           sbuf1_j            = sbuf1[j];
2260           k                  = ++sbuf1_j[0];
2261           sbuf1_j[2 * k]     = ctr_j;
2262           sbuf1_j[2 * k - 1] = i;
2263         }
2264       }
2265     }
2266 
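    /* Layout sketch (illustration only): each outgoing request sbuf1[j] now reads
       [nIS | is_1, nrows_1 | is_2, nrows_2 | ... | row indices grouped by IS], where the
       header occupies 1 + 2*w3[j] entries, matching the message-length estimate above. */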
2267     /* Now post the sends */
2268     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2269     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2270       PetscMPIInt j = pa[i];
2271       PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2272     }
2273 
2274     /* Post Receives to capture the buffer size */
2275     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2276     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2277     if (nrqs) rbuf2[0] = tmp + msz;
2278     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2279     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2280       PetscMPIInt j = pa[i];
2281       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2282     }
2283 
2284     /* Send to other procs the buf size they should allocate */
2285     /* Receive messages*/
2286     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2287     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2288     {
2289       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2290       PetscInt *sbuf2_i;
2291 
2292       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2293       for (PetscMPIInt i = 0; i < nrqr; ++i) {
2294         req_size[i] = 0;
2295         rbuf1_i     = rbuf1[i];
2296         start       = 2 * rbuf1_i[0] + 1;
2297         end         = olengths1[i];
2298         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2299         sbuf2_i = sbuf2[i];
2300         for (PetscInt j = start; j < end; j++) {
2301           id         = rbuf1_i[j] - rstart;
2302           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2303           sbuf2_i[j] = ncols;
2304           req_size[i] += ncols;
2305         }
2306         req_source1[i] = onodes1[i];
2307         /* form the header */
2308         sbuf2_i[0] = req_size[i];
2309         for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2310 
2311         PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2312       }
2313     }
2314 
2315     PetscCall(PetscFree(onodes1));
2316     PetscCall(PetscFree(olengths1));
2317     PetscCall(PetscFree(r_waits1));
2318     PetscCall(PetscFree4(w1, w2, w3, w4));
2319 
2320     /* Receive messages*/
2321     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2322     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2323     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2324       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2325       req_source2[i] = pa[i];
2326       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2327     }
2328     PetscCall(PetscFree(r_waits2));
2329 
2330     /* Wait on sends1 and sends2 */
2331     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2332     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2333     PetscCall(PetscFree(s_waits1));
2334     PetscCall(PetscFree(s_waits2));
2335 
2336     /* Now allocate sending buffers for a->j, and send them off */
2337     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2338     jcnt = 0;
2339     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2340     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
2341     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2342 
2343     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2344     {
2345       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2346       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2347       PetscInt  cend = C->cmap->rend;
2348       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2349 
2350       for (PetscMPIInt i = 0; i < nrqr; i++) {
2351         rbuf1_i   = rbuf1[i];
2352         sbuf_aj_i = sbuf_aj[i];
2353         ct1       = 2 * rbuf1_i[0] + 1;
2354         ct2       = 0;
2355         for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2356           kmax = rbuf1[i][2 * j];
2357           for (PetscInt k = 0; k < kmax; k++, ct1++) {
2358             row    = rbuf1_i[ct1] - rstart;
2359             nzA    = a_i[row + 1] - a_i[row];
2360             nzB    = b_i[row + 1] - b_i[row];
2361             ncols  = nzA + nzB;
2362             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2363             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2364 
2365             /* load the column indices for this row into cols */
2366             cols = sbuf_aj_i + ct2;
2367 
2368             lwrite = 0;
2369             for (PetscInt l = 0; l < nzB; l++) {
2370               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2371             }
2372             for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2373             for (PetscInt l = 0; l < nzB; l++) {
2374               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2375             }
2376 
2377             ct2 += ncols;
2378           }
2379         }
2380         PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2381       }
2382     }
2383 
2384     /* create col map: global col of C -> local col of submatrices */
2385     {
2386       const PetscInt *icol_i;
2387 #if defined(PETSC_USE_CTABLE)
2388       for (PetscInt i = 0; i < ismax; i++) {
2389         if (!allcolumns[i]) {
2390           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2391 
2392           jmax   = ncol[i];
2393           icol_i = icol[i];
2394           cmap_i = cmap[i];
2395           for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2396         } else cmap[i] = NULL;
2397       }
2398 #else
2399       for (PetscInt i = 0; i < ismax; i++) {
2400         if (!allcolumns[i]) {
2401           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2402           jmax   = ncol[i];
2403           icol_i = icol[i];
2404           cmap_i = cmap[i];
2405           for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2406         } else cmap[i] = NULL;
2407       }
2408 #endif
2409     }
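    /* Note: as in the single-IS routine above, cmap[i] stores the submat column index
       shifted by +1 so that a lookup of 0 means "column not selected"; it is decremented
       before use. */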
2410 
2411     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2412     jcnt = 0;
2413     for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i];
2414     PetscCall(PetscMalloc1(ismax, &lens));
2415 
2416     if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0]));
2417     for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
2418 
2419     /* Update lens from local data */
2420     for (PetscInt i = 0; i < ismax; i++) {
2421       row2proc_i = row2proc[i];
2422       jmax       = nrow[i];
2423       if (!allcolumns[i]) cmap_i = cmap[i];
2424       irow_i = irow[i];
2425       lens_i = lens[i];
2426       for (PetscInt j = 0; j < jmax; j++) {
2427         row  = irow_i[j];
2428         proc = row2proc_i[j];
2429         if (proc == rank) {
2430           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2431           if (!allcolumns[i]) {
2432             for (PetscInt k = 0; k < ncols; k++) {
2433 #if defined(PETSC_USE_CTABLE)
2434               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2435 #else
2436               tcol = cmap_i[cols[k]];
2437 #endif
2438               if (tcol) lens_i[j]++;
2439             }
2440           } else { /* allcolumns */
2441             lens_i[j] = ncols;
2442           }
2443           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2444         }
2445       }
2446     }
2447 
2448     /* Create row map: global row of C -> local row of submatrices */
2449 #if defined(PETSC_USE_CTABLE)
2450     for (PetscInt i = 0; i < ismax; i++) {
2451       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2452       irow_i = irow[i];
2453       jmax   = nrow[i];
2454       for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2455     }
2456 #else
2457     for (PetscInt i = 0; i < ismax; i++) {
2458       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2459       rmap_i = rmap[i];
2460       irow_i = irow[i];
2461       jmax   = nrow[i];
2462       for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2463     }
2464 #endif
2465 
2466     /* Update lens from offproc data */
2467     {
2468       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2469 
2470       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2471       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2472         sbuf1_i = sbuf1[pa[tmp2]];
2473         jmax    = sbuf1_i[0];
2474         ct1     = 2 * jmax + 1;
2475         ct2     = 0;
2476         rbuf2_i = rbuf2[tmp2];
2477         rbuf3_i = rbuf3[tmp2];
2478         for (PetscInt j = 1; j <= jmax; j++) {
2479           is_no  = sbuf1_i[2 * j - 1];
2480           max1   = sbuf1_i[2 * j];
2481           lens_i = lens[is_no];
2482           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2483           rmap_i = rmap[is_no];
2484           for (PetscInt k = 0; k < max1; k++, ct1++) {
2485 #if defined(PETSC_USE_CTABLE)
2486             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2487             row--;
2488             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2489 #else
2490             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new matrix */
2491 #endif
2492             max2 = rbuf2_i[ct1];
2493             for (PetscInt l = 0; l < max2; l++, ct2++) {
2494               if (!allcolumns[is_no]) {
2495 #if defined(PETSC_USE_CTABLE)
2496                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2497 #else
2498                 tcol = cmap_i[rbuf3_i[ct2]];
2499 #endif
2500                 if (tcol) lens_i[row]++;
2501               } else {         /* allcolumns */
2502                 lens_i[row]++; /* lens_i[row] += max2 ? */
2503               }
2504             }
2505           }
2506         }
2507       }
2508     }
2509     PetscCall(PetscFree(r_waits3));
2510     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2511     PetscCall(PetscFree(s_waits3));
2512 
2513     /* Create the submatrices */
2514     for (PetscInt i = 0; i < ismax; i++) {
2515       PetscInt rbs, cbs;
2516 
2517       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2518       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2519 
2520       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2521       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2522 
2523       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2524       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2525       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2526 
2527       /* create struct Mat_SubSppt and attach it to submat */
2528       PetscCall(PetscNew(&smat_i));
2529       subc            = (Mat_SeqAIJ *)submats[i]->data;
2530       subc->submatis1 = smat_i;
2531 
2532       smat_i->destroy          = submats[i]->ops->destroy;
2533       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2534       submats[i]->factortype   = C->factortype;
2535 
2536       smat_i->id          = i;
2537       smat_i->nrqs        = nrqs;
2538       smat_i->nrqr        = nrqr;
2539       smat_i->rbuf1       = rbuf1;
2540       smat_i->rbuf2       = rbuf2;
2541       smat_i->rbuf3       = rbuf3;
2542       smat_i->sbuf2       = sbuf2;
2543       smat_i->req_source2 = req_source2;
2544 
2545       smat_i->sbuf1 = sbuf1;
2546       smat_i->ptr   = ptr;
2547       smat_i->tmp   = tmp;
2548       smat_i->ctr   = ctr;
2549 
2550       smat_i->pa          = pa;
2551       smat_i->req_size    = req_size;
2552       smat_i->req_source1 = req_source1;
2553 
2554       smat_i->allcolumns = allcolumns[i];
2555       smat_i->singleis   = PETSC_FALSE;
2556       smat_i->row2proc   = row2proc[i];
2557       smat_i->rmap       = rmap[i];
2558       smat_i->cmap       = cmap[i];
2559     }
2560 
2561     if (!ismax) { /* Create dummy submats[0] to carry struct Mat_SubSppt for reuse */
2562       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2563       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2564       PetscCall(MatSetType(submats[0], MATDUMMY));
2565 
2566       /* create struct Mat_SubSppt and attach it to submat */
2567       PetscCall(PetscNew(&smat_i));
2568       submats[0]->data = (void *)smat_i;
2569 
2570       smat_i->destroy          = submats[0]->ops->destroy;
2571       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2572       submats[0]->factortype   = C->factortype;
2573 
2574       smat_i->id          = 0;
2575       smat_i->nrqs        = nrqs;
2576       smat_i->nrqr        = nrqr;
2577       smat_i->rbuf1       = rbuf1;
2578       smat_i->rbuf2       = rbuf2;
2579       smat_i->rbuf3       = rbuf3;
2580       smat_i->sbuf2       = sbuf2;
2581       smat_i->req_source2 = req_source2;
2582 
2583       smat_i->sbuf1 = sbuf1;
2584       smat_i->ptr   = ptr;
2585       smat_i->tmp   = tmp;
2586       smat_i->ctr   = ctr;
2587 
2588       smat_i->pa          = pa;
2589       smat_i->req_size    = req_size;
2590       smat_i->req_source1 = req_source1;
2591 
2592       smat_i->allcolumns = PETSC_FALSE;
2593       smat_i->singleis   = PETSC_FALSE;
2594       smat_i->row2proc   = NULL;
2595       smat_i->rmap       = NULL;
2596       smat_i->cmap       = NULL;
2597     }
2598 
2599     if (ismax) PetscCall(PetscFree(lens[0]));
2600     PetscCall(PetscFree(lens));
2601     if (sbuf_aj) {
2602       PetscCall(PetscFree(sbuf_aj[0]));
2603       PetscCall(PetscFree(sbuf_aj));
2604     }
2605 
2606   } /* end of scall == MAT_INITIAL_MATRIX */
2607 
2608   /* Post recv matrix values */
2609   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2610   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2611   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2612   for (PetscMPIInt i = 0; i < nrqs; ++i) {
2613     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2614     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2615   }
2616 
2617   /* Allocate sending buffers for a->a, and send them off */
2618   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2619   jcnt = 0;
2620   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2621   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
2622   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2623 
2624   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2625   {
2626     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2627     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2628     PetscInt     cend = C->cmap->rend;
2629     PetscInt    *b_j  = b->j;
2630     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2631 
2632     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2633     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2634     for (PetscMPIInt i = 0; i < nrqr; i++) {
2635       rbuf1_i   = rbuf1[i];
2636       sbuf_aa_i = sbuf_aa[i];
2637       ct1       = 2 * rbuf1_i[0] + 1;
2638       ct2       = 0;
2639       for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2640         kmax = rbuf1_i[2 * j];
2641         for (PetscInt k = 0; k < kmax; k++, ct1++) {
2642           row    = rbuf1_i[ct1] - rstart;
2643           nzA    = a_i[row + 1] - a_i[row];
2644           nzB    = b_i[row + 1] - b_i[row];
2645           ncols  = nzA + nzB;
2646           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2647           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2648           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2649 
2650           /* load the column values for this row into vals*/
2651           vals = sbuf_aa_i + ct2;
2652 
2653           lwrite = 0;
2654           for (PetscInt l = 0; l < nzB; l++) {
2655             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2656           }
2657           for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2658           for (PetscInt l = 0; l < nzB; l++) {
2659             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2660           }
2661 
2662           ct2 += ncols;
2663         }
2664       }
2665       PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2666     }
2667     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2668     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2669   }
2670 
2671   /* Assemble the matrices */
2672   /* First assemble the local rows */
2673   for (PetscInt i = 0; i < ismax; i++) {
2674     row2proc_i = row2proc[i];
2675     subc       = (Mat_SeqAIJ *)submats[i]->data;
2676     imat_ilen  = subc->ilen;
2677     imat_j     = subc->j;
2678     imat_i     = subc->i;
2679     imat_a     = subc->a;
2680 
2681     if (!allcolumns[i]) cmap_i = cmap[i];
2682     rmap_i = rmap[i];
2683     irow_i = irow[i];
2684     jmax   = nrow[i];
2685     for (PetscInt j = 0; j < jmax; j++) {
2686       row  = irow_i[j];
2687       proc = row2proc_i[j];
2688       if (proc == rank) {
2689         old_row = row;
2690 #if defined(PETSC_USE_CTABLE)
2691         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2692         row--;
2693 #else
2694         row = rmap_i[row];
2695 #endif
2696         ilen_row = imat_ilen[row];
2697         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2698         mat_i = imat_i[row];
2699         mat_a = imat_a + mat_i;
2700         mat_j = imat_j + mat_i;
2701         if (!allcolumns[i]) {
2702           for (PetscInt k = 0; k < ncols; k++) {
2703 #if defined(PETSC_USE_CTABLE)
2704             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2705 #else
2706             tcol = cmap_i[cols[k]];
2707 #endif
2708             if (tcol) {
2709               *mat_j++ = tcol - 1;
2710               *mat_a++ = vals[k];
2711               ilen_row++;
2712             }
2713           }
2714         } else { /* allcolumns */
2715           for (PetscInt k = 0; k < ncols; k++) {
2716             *mat_j++ = cols[k]; /* global col index! */
2717             *mat_a++ = vals[k];
2718             ilen_row++;
2719           }
2720         }
2721         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2722 
2723         imat_ilen[row] = ilen_row;
2724       }
2725     }
2726   }
2727 
2728   /* Now assemble the off-process rows */
2729   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2730   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2731     sbuf1_i = sbuf1[pa[tmp2]];
2732     jmax    = sbuf1_i[0];
2733     ct1     = 2 * jmax + 1;
2734     ct2     = 0;
2735     rbuf2_i = rbuf2[tmp2];
2736     rbuf3_i = rbuf3[tmp2];
2737     rbuf4_i = rbuf4[tmp2];
2738     for (PetscInt j = 1; j <= jmax; j++) {
2739       is_no  = sbuf1_i[2 * j - 1];
2740       rmap_i = rmap[is_no];
2741       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2742       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2743       imat_ilen = subc->ilen;
2744       imat_j    = subc->j;
2745       imat_i    = subc->i;
2746       imat_a    = subc->a;
2747       max1      = sbuf1_i[2 * j];
2748       for (PetscInt k = 0; k < max1; k++, ct1++) {
2749         row = sbuf1_i[ct1];
2750 #if defined(PETSC_USE_CTABLE)
2751         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2752         row--;
2753 #else
2754         row = rmap_i[row];
2755 #endif
2756         ilen  = imat_ilen[row];
2757         mat_i = imat_i[row];
2758         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2759         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2760         max2  = rbuf2_i[ct1];
2761         if (!allcolumns[is_no]) {
2762           for (PetscInt l = 0; l < max2; l++, ct2++) {
2763 #if defined(PETSC_USE_CTABLE)
2764             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2765 #else
2766             tcol = cmap_i[rbuf3_i[ct2]];
2767 #endif
2768             if (tcol) {
2769               *mat_j++ = tcol - 1;
2770               *mat_a++ = rbuf4_i[ct2];
2771               ilen++;
2772             }
2773           }
2774         } else { /* allcolumns */
2775           for (PetscInt l = 0; l < max2; l++, ct2++) {
2776             *mat_j++ = rbuf3_i[ct2]; /* the global column index of C, used as is */
2777             *mat_a++ = rbuf4_i[ct2];
2778             ilen++;
2779           }
2780         }
2781         imat_ilen[row] = ilen;
2782       }
2783     }
2784   }
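
  /*
     Layout sketch of a request message sbuf1_i decoded above (illustrative): for two index
     sets the header is sbuf1_i = { 2, is_no0, nrow0, is_no1, nrow1, ... }, with the requested
     row ids following at offset ct1 = 2*jmax + 1 = 5; for each such row, rbuf2_i[ct1] carries
     the number of columns returned, while rbuf3_i/rbuf4_i stream the column ids and values.
  */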
2785 
2786   if (!iscsorted) { /* sort column indices of the rows */
2787     for (PetscInt i = 0; i < ismax; i++) {
2788       subc      = (Mat_SeqAIJ *)submats[i]->data;
2789       imat_j    = subc->j;
2790       imat_i    = subc->i;
2791       imat_a    = subc->a;
2792       imat_ilen = subc->ilen;
2793 
2794       if (allcolumns[i]) continue;
2795       jmax = nrow[i];
2796       for (PetscInt j = 0; j < jmax; j++) {
2797         mat_i = imat_i[j];
2798         mat_a = imat_a + mat_i;
2799         mat_j = imat_j + mat_i;
2800         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2801       }
2802     }
2803   }
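
  /*
     A minimal sketch of the sort used above (illustrative values): PetscSortIntWithScalarArray()
     sorts the integer keys in place and applies the same permutation to the scalar values:

       PetscInt    cols[] = {7, 2, 5};
       PetscScalar vals[] = {70., 20., 50.};
       PetscCall(PetscSortIntWithScalarArray(3, cols, vals));
       // now cols = {2, 5, 7} and vals = {20., 50., 70.}
  */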
2804 
2805   PetscCall(PetscFree(r_waits4));
2806   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2807   PetscCall(PetscFree(s_waits4));
2808 
2809   /* Restore the indices */
2810   for (PetscInt i = 0; i < ismax; i++) {
2811     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2812     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2813   }
2814 
2815   for (PetscInt i = 0; i < ismax; i++) {
2816     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2817     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2818   }
2819 
2820   /* Destroy allocated memory */
2821   if (sbuf_aa) {
2822     PetscCall(PetscFree(sbuf_aa[0]));
2823     PetscCall(PetscFree(sbuf_aa));
2824   }
2825   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2826 
2827   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2828   PetscCall(PetscFree(rbuf4));
2829 
2830   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2831   PetscFunctionReturn(PETSC_SUCCESS);
2832 }
2833 
2834 /*
2835  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2836  The embeddings are assumed to be injections, which together with the sizes checked below implies that
2837  the range of rowemb lies in [0,m), the range of dcolemb in [0,n), and the range of ocolemb in [0,N-n).
2838  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2839  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2840  state, and needs to be "assembled" later by compressing B's column space.
2841 
2842  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2843  Following this call, C->A & C->B have been created, even if empty.
2844  */
2845 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2846 {
2847   /* If making this function public, change the error returned in this function away from _PLIB. */
2848   Mat_MPIAIJ     *aij;
2849   Mat_SeqAIJ     *Baij;
2850   PetscBool       seqaij, Bdisassembled;
2851   PetscInt        m, n, *nz, col, cstart, cend, shift, count;
2852   PetscScalar     v;
2853   const PetscInt *rowindices, *colindices;
2854 
2855   PetscFunctionBegin;
2856   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2857   if (A) {
2858     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2859     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2860     if (rowemb) {
2861       PetscCall(ISGetLocalSize(rowemb, &m));
2862       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2863     } else {
2864       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2865     }
2866     if (dcolemb) {
2867       PetscCall(ISGetLocalSize(dcolemb, &n));
2868       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2869     } else {
2870       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2871     }
2872   }
2873   if (B) {
2874     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2875     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2876     if (rowemb) {
2877       PetscCall(ISGetLocalSize(rowemb, &m));
2878       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2879     } else {
2880       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2881     }
2882     if (ocolemb) {
2883       PetscCall(ISGetLocalSize(ocolemb, &n));
2884       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2885     } else {
2886       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2887     }
2888   }
2889 
2890   aij = (Mat_MPIAIJ *)C->data;
2891   if (!aij->A) {
2892     /* Mimic parts of MatMPIAIJSetPreallocation() */
2893     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2894     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2895     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2896     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2897   }
2898   if (A) {
2899     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2900   } else {
2901     PetscCall(MatSetUp(aij->A));
2902   }
2903   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2904     /*
2905       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2906       need to "disassemble" B -- convert it to use C's global indices.
2907       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2908 
2909       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2910       we call MatZeroEntries(B) first, so the result may contain stored zeros that could, perhaps, be compacted out.
2911 
2912       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2913       At least avoid calling MatSetValues() and the implied searches?
2914     */
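    /*
       A worked sketch of the "disassembled" column indexing used below (illustrative
       numbers): if this rank owns C's global columns [cstart,cend) = [20,30), so
       shift = cend - cstart = 10, then an off-diag local column col < cstart already
       equals its global column, while col >= cstart lies at or beyond cend globally
       and is shifted: col = 25 becomes global column 25 + 10 = 35.
    */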
2915 
2916     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2917 #if defined(PETSC_USE_CTABLE)
2918       PetscCall(PetscHMapIDestroy(&aij->colmap));
2919 #else
2920       PetscCall(PetscFree(aij->colmap));
2921       /* A bit of a hack: ideally we should handle the aij->B case all in one code block below. */
2922 #endif
2925       if (aij->garray) PetscCall(PetscFree(aij->garray));
2926       PetscCall(VecDestroy(&aij->lvec));
2927       PetscCall(VecScatterDestroy(&aij->Mvctx));
2928     }
2929     if (aij->B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2930     if (aij->B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2931   }
2932   Bdisassembled = PETSC_FALSE;
2933   if (!aij->B) {
2934     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2935     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2936     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2937     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2938     Bdisassembled = PETSC_TRUE;
2939   }
2940   if (B) {
2941     Baij = (Mat_SeqAIJ *)B->data;
2942     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2943       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2944       for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2945       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2946       PetscCall(PetscFree(nz));
2947     }
2948 
2949     PetscCall(PetscLayoutGetRange(C->cmap, &cstart, &cend));
2950     shift      = cend - cstart;
2951     count      = 0;
2952     rowindices = NULL;
2953     colindices = NULL;
2954     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2955     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2956     for (PetscInt i = 0; i < B->rmap->n; i++) {
2957       PetscInt row;
2958       row = i;
2959       if (rowindices) row = rowindices[i];
2960       for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2961         col = Baij->j[count];
2962         if (colindices) col = colindices[col];
2963         if (Bdisassembled && col >= cstart) col += shift;
2964         v = Baij->a[count];
2965         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2966         ++count;
2967       }
2968     }
2969     /* No assembly for aij->B is necessary. */
2970     /* FIXME: set aij->B's nonzerostate correctly. */
2971   } else {
2972     PetscCall(MatSetUp(aij->B));
2973   }
2974   C->preallocated  = PETSC_TRUE;
2975   C->was_assembled = PETSC_FALSE;
2976   C->assembled     = PETSC_FALSE;
2977   /*
2978       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2979       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2980    */
2981   PetscFunctionReturn(PETSC_SUCCESS);
2982 }
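
/*
  A hypothetical call sequence for the routine above (sketch only; Adiag and Boffdiag are
  assumed to be previously assembled SEQAIJ matrices with sizes compatible with C, as
  checked at the top of the routine). NULL embeddings mean the pieces map one-to-one onto
  C's local rows and columns:

    PetscCall(MatSetSeqMats_MPIAIJ(C, NULL, NULL, NULL, DIFFERENT_NONZERO_PATTERN, Adiag, Boffdiag));
    PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); // compresses B's column space via MatSetUpMultiply_MPIAIJ()
    PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
*/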
2983 
2984 /*
2985   B uses local column indices ranging over [0, N-n); they must be interpreted using garray.
2986  */
2987 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2988 {
2989   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2990 
2991   PetscFunctionBegin;
2992   PetscAssertPointer(A, 2);
2993   PetscAssertPointer(B, 3);
2994   /* FIXME: make sure C is assembled */
2995   *A = aij->A;
2996   *B = aij->B;
2997   /* Note that we don't incref *A and *B, so be careful! */
2998   PetscFunctionReturn(PETSC_SUCCESS);
2999 }
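
/*
  A minimal sketch of that interpretation (illustrative numbers; fields as defined in
  Mat_MPIAIJ/Mat_SeqAIJ): if aij->garray = {3, 17, 42}, then a column index 1 stored in
  B's aij structure refers to global column garray[1] = 17 of C:

    Mat         A, B;
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
    Mat_SeqAIJ *Bij;

    PetscCall(MatGetSeqMats_MPIAIJ(C, &A, &B));
    Bij = (Mat_SeqAIJ *)B->data;
    // global column of the k-th stored entry of B: aij->garray[Bij->j[k]]
*/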
3000 
3001 /*
3002   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3003   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3004 */
3005 static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3006 {
3007   PetscMPIInt size, flag;
3008   PetscInt    cismax, ispar;
3009   Mat        *A, *B;
3010   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3011 
3012   PetscFunctionBegin;
3013   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3014 
3015   cismax = 0;
3016   for (PetscInt i = 0; i < ismax; ++i) {
3017     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3018     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3019     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3020     if (size > 1) ++cismax;
3021   }
3022 
3023   /*
3024      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3025      ispar is the maximum over C's comm of the per-rank count of parallel ISs; it is nonzero iff some rank has a parallel IS.
3026   */
3027   PetscCallMPI(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3028   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3029     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3030     PetscFunctionReturn(PETSC_SUCCESS);
3031   }
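  /*
     For example (illustrative): with per-rank cismax values {0, 2, 0}, the MPI_MAX reduction
     gives ispar = 2 on every rank, so all ranks take the parallel path below; only an
     all-zero cismax yields ispar = 0 and the early sequential return above.
  */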
3032 
3033   /* if (ispar) */
3034   /*
3035     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3036     These are used to extract the off-diag portion of the resulting parallel matrix.
3037     The row IS for the off-diag portion is the same as for the diag portion,
3038     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3039   */
3040   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3041   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3042   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3043     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3044     if (size > 1) {
3045       /*
3046          TODO: This is the part that's ***NOT SCALABLE***.
3047          To fix this we need to extract just the indices of C's nonzero columns
3048          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3049          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3050          to be done without serializing on the IS list, so, most likely, it is best
3051          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3052       */
3053       PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3054       /* Now we have to
3055          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3056              were sorted on each rank, concatenated they might no longer be sorted;
3057          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3058              indices in the nondecreasing order to the original index positions.
3059          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3060       */
3061       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3062       PetscCall(ISSort(ciscol[ii]));
3063       ++ii;
3064     }
3065   }
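
  /*
     A small sketch of the sort/permutation bookkeeping above (illustrative values): if
     ciscol[ii] initially holds {42, 7, 19}, ISSortPermutation() yields the permutation
     {1, 2, 0} (for each entry of the sorted IS, its position in the original IS), and
     ISSort() then turns ciscol[ii] into {7, 19, 42}; ciscol_p[ii] is later passed to
     setseqmats() so B's columns can be embedded in the caller's original ordering.
  */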
3066   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3067   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3068     PetscInt        issize;
3069     const PetscInt *indices;
3070 
3071     /*
3072        Permute the indices into nondecreasing order. Reject row and col ISs that contain duplicate indices.
3073      */
3074     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3075     PetscCall(ISSort(isrow[i]));
3076     PetscCall(ISGetLocalSize(isrow[i], &issize));
3077     PetscCall(ISGetIndices(isrow[i], &indices));
3078     for (PetscInt j = 1; j < issize; ++j) {
3079       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3080     }
3081     PetscCall(ISRestoreIndices(isrow[i], &indices));
3082     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3083     PetscCall(ISSort(iscol[i]));
3084     PetscCall(ISGetLocalSize(iscol[i], &issize));
3085     PetscCall(ISGetIndices(iscol[i], &indices));
3086     for (PetscInt j = 1; j < issize; ++j) {
3087       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3088     }
3089     PetscCall(ISRestoreIndices(iscol[i], &indices));
3090     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3091     if (size > 1) {
3092       cisrow[ii] = isrow[i];
3093       ++ii;
3094     }
3095   }
3096   /*
3097     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3098     array of sequential matrices underlying the resulting parallel matrices.
3099     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3100     contain duplicates.
3101 
3102     There are as many diag matrices as there are original index sets. There are only as many parallel
3103     and off-diag matrices as there are parallel (comm size > 1) index sets.
3104 
3105     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3106     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3107       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3108       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3109       with A[i] and B[ii] extracted from the corresponding MPI submat.
3110     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3111       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3112       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3113       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3114       values into A[i] and B[ii] sitting inside the corresponding submat.
3115     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3116       A[i], B[ii], AA[i] or BB[ii] matrices.
3117   */
3118   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3119   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3120 
3121   /* Now obtain the sequential A and B submatrices separately. */
3122   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3123   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3124   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3125 
3126   /*
3127     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3128     matrices A & B have been extracted directly into the parallel matrices containing them, or
3129     simply into the sequential matrix identical with the corresponding A (if size == 1).
3130     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3131     to have the same sparsity pattern.
3132     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3133     must be constructed for C. This is done by setseqmat(s).
3134   */
3135   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3136     /*
3137        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3138        That way we can avoid sorting and computing permutations when reusing.
3139        To this end:
3140         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3141         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3142           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3143     */
3144     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3145 
3146     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3147     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3148     if (size > 1) {
3149       if (scall == MAT_INITIAL_MATRIX) {
3150         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3151         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3152         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3153         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3154         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3155       }
3156       /*
3157         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3158       */
3159       {
3160         Mat AA = A[i], BB = B[ii];
3161 
3162         if (AA || BB) {
3163           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3164           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3165           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3166         }
3167         PetscCall(MatDestroy(&AA));
3168       }
3169       PetscCall(ISDestroy(ciscol + ii));
3170       PetscCall(ISDestroy(ciscol_p + ii));
3171       ++ii;
3172     } else { /* if (size == 1) */
3173       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3174       if (isrow_p[i] || iscol_p[i]) {
3175         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3176         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3177         /* Otherwise A is extracted straight into (*submat)[i]. */
3178         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3179         PetscCall(MatDestroy(A + i));
3180       } else (*submat)[i] = A[i];
3181     }
3182     PetscCall(ISDestroy(&isrow_p[i]));
3183     PetscCall(ISDestroy(&iscol_p[i]));
3184   }
3185   PetscCall(PetscFree2(cisrow, ciscol));
3186   PetscCall(PetscFree2(isrow_p, iscol_p));
3187   PetscCall(PetscFree(ciscol_p));
3188   PetscCall(PetscFree(A));
3189   PetscCall(MatDestroySubMatrices(cismax, &B));
3190   PetscFunctionReturn(PETSC_SUCCESS);
3191 }
3192 
3193 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3194 {
3195   PetscFunctionBegin;
3196   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3197   PetscFunctionReturn(PETSC_SUCCESS);
3198 }
3199
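/*
  A hypothetical usage sketch of the public interface that reaches the routine above
  (MatCreateSubMatricesMPI(); C is an assembled MPIAIJ matrix and the values are
  illustrative). The ISs live on C's full communicator, which is the "parallel IS"
  case handled by MatCreateSubMatricesMPI_MPIXAIJ():

    PetscMPIInt rank;
    IS          isrow, iscol;
    Mat        *submats;

    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)C), &rank));
    // each rank contributes two distinct rows/columns of one parallel submatrix
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), 2, 2 * rank, 1, &isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), 2, 2 * rank, 1, &iscol));
    PetscCall(MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats));
    // ... use submats[0] ...
    PetscCall(MatDestroySubMatrices(1, &submats));
    PetscCall(ISDestroy(&isrow));
    PetscCall(ISDestroy(&iscol));
*/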