xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 9b529ac9d3e6c2bd81b4bb2dab3f698fdbcd19f2)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
25    that is a very major recoding job.
26 
27    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
28 */
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - increase the overlap of the index sets is[] by one level,
  exchanging only the rows that are actually referenced on remote ranks (memory use scales with the
  number of exchanged rows, not with the communicator size).

  Phases:
    1. scan all ISs and collect every index owned by another rank, tagged with its IS id;
    2. build a two-sided communication pattern and ship <is id, row> pairs to the owning ranks via a PetscSF;
    3. owners expand those rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and the replies flow back
       over a second PetscSF, overlapped with the purely local expansion
       (MatIncreaseOverlap_MPIAIJ_Local_Scalable);
    4. received rows are merged into the ISs by MatIncreaseOverlap_MPIAIJ_Receive_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we can optimize communication */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  tlength = 0;
  for (PetscInt i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors; tlength bounds how many indices can be remote */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (PetscInt i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (PetscInt j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   */
  PetscCallMPI(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* no rank references a remote row, so no communication is needed;
   * this usually corresponds to overlap 1 -- do the local expansion only
   */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (PetscMPIInt i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size: two PetscInt per row (<is id, row>) */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* MPI process */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (PetscInt i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  /* one SF leaf per incoming PetscInt, rooted at the sender's buffer offset */
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) {
    for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* pack the outgoing message pairs <no of is, row>, grouped by destination via toffsets */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (PetscInt i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* process */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second exchange: the former receivers now send their expanded rows back */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) {
    for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: do the local expansion while replies are in flight */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  /* fold the received overlap rows into the index sets */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
231 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from other ranks into the index sets

  recvdata is a stream of runs <is id, data_size, row...>; every run is appended to the IS it names,
  duplicates are removed, and each IS is rebuilt on (a duplicate of) its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  /* record the current size of every IS and the largest one */
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one scratch slot of (max_lsize + nrecvs) entries per IS: enough for its current
     indices plus, in the worst case, every received value */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++]; /* which IS this run belongs to */
    data_size = recvdata[i++]; /* number of rows in the run */
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and rebuild each IS */
  for (i = 0; i < nidx; i++) {
    indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}
279 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - build the reply messages for rows other ranks asked about

  For every requesting rank, regroup the received <is id, row> pairs by index set, gather the global
  column indices of those locally owned rows from both the diagonal (A) and off-diagonal (B) blocks,
  sort/uniquify them, and pack them as triples <is id, number of rows, rows> into one send buffer.
  The work is done twice: a first pass only counts the deduplicated columns so the buffer can be
  sized exactly, a second pass fills it.

  Output Parameters:
+ sbrowsizes - per destination rank, pairs <message size, offset into sbrows> (allocated here, caller frees)
- sbrows     - the packed reply data (allocated here, caller frees)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the deduplicated columns per <rank, is> pair */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together: split this rank's <is id, row> pairs into per-IS row lists */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* local row number */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat: shift local columns into global numbering */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat: map through the global column list */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: repeat the same traversal, this time packing the data into sbdata */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue; /* nothing to send back for this IS */
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* turn per-rank sizes into <size, offset> pairs as expected by the two-sided exchange */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
401 
402 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
403 {
404   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
405   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
406   PetscInt        lsize, lsize_tmp;
407   PetscMPIInt     rank, owner;
408   Mat             amat, bmat;
409   PetscBool       done;
410   PetscLayout     cmap, rmap;
411   MPI_Comm        comm;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
415   PetscCallMPI(MPI_Comm_rank(comm, &rank));
416   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
417   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
418   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
419   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
420   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
421   /* is it a safe way to compute number of nonzero values ? */
422   tnz = ai[an] + bi[bn];
423   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
424   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
425   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
426   /* it is a better way to estimate memory than the old implementation
427    * where global size of matrix is used
428    * */
429   PetscCall(PetscMalloc1(tnz, &indices_temp));
430   for (i = 0; i < nidx; i++) {
431     MPI_Comm iscomm;
432 
433     PetscCall(ISGetLocalSize(is[i], &lsize));
434     PetscCall(ISGetIndices(is[i], &indices));
435     lsize_tmp = 0;
436     for (j = 0; j < lsize; j++) {
437       owner = -1;
438       row   = indices[j];
439       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
440       if (owner != rank) continue;
441       /* local number */
442       row -= rstart;
443       start = ai[row];
444       end   = ai[row + 1];
445       for (k = start; k < end; k++) { /* Amat */
446         col                       = aj[k] + cstart;
447         indices_temp[lsize_tmp++] = col;
448       }
449       start = bi[row];
450       end   = bi[row + 1];
451       for (k = start; k < end; k++) { /* Bmat */
452         col                       = gcols[bj[k]];
453         indices_temp[lsize_tmp++] = col;
454       }
455     }
456     PetscCall(ISRestoreIndices(is[i], &indices));
457     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
458     PetscCall(ISDestroy(&is[i]));
459     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
460     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
461     PetscCall(PetscCommDestroy(&iscomm));
462   }
463   PetscCall(PetscFree(indices_temp));
464   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
465   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
466   PetscFunctionReturn(PETSC_SUCCESS);
467 }
468 
469 /*
470   Sample message format:
471   If a processor A wants processor B to process some elements corresponding
472   to index sets is[1],is[5]
473   mesg [0] = 2   (no of index sets in the mesg)
474   -----------
475   mesg [1] = 1 => is[1]
476   mesg [2] = sizeof(is[1]);
477   -----------
478   mesg [3] = 5  => is[5]
479   mesg [4] = sizeof(is[5]);
480   -----------
481   mesg [5]
482   mesg [n]  data(is[1])
483   -----------
484   mesg[n+1]
485   mesg[m]  data(is[5])
486   -----------
487 
488   Notes:
489   nrqs - no of requests sent (or to be sent out)
490   nrqr - no of requests received (which have to be or which have been processed)
491 */
492 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
493 {
494   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
495   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
496   const PetscInt **idx, *idx_i;
497   PetscInt        *n, **data, len;
498 #if defined(PETSC_USE_CTABLE)
499   PetscHMapI *table_data, table_data_i;
500   PetscInt   *tdata, tcount, tcount_max;
501 #else
502   PetscInt *data_i, *d_p;
503 #endif
504   PetscMPIInt  size, rank, tag1, tag2, proc = 0, nrqs, *pa;
505   PetscInt     M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
506   PetscInt    *ctr, *tmp, *isz, **xdata, **rbuf2;
507   PetscBT     *table;
508   MPI_Comm     comm;
509   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
510   MPI_Status  *recv_status;
511   MPI_Comm    *iscomms;
512   char        *t_p;
513 
514   PetscFunctionBegin;
515   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
516   size = c->size;
517   rank = c->rank;
518   M    = C->rmap->N;
519 
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
521   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
522 
523   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
524 
525   for (PetscInt i = 0; i < imax; i++) {
526     PetscCall(ISGetIndices(is[i], &idx[i]));
527     PetscCall(ISGetLocalSize(is[i], &n[i]));
528   }
529 
530   /* evaluate communication - mesg to who,length of mesg, and buffer space
531      required. Based on this, buffers are allocated, and data copied into them  */
532   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
533   for (PetscInt i = 0; i < imax; i++) {
534     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
535     idx_i = idx[i];
536     len   = n[i];
537     for (PetscInt j = 0; j < len; j++) {
538       row = idx_i[j];
539       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
540       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
541       w4[proc]++;
542     }
543     for (PetscMPIInt j = 0; j < size; j++) {
544       if (w4[j]) {
545         w1[j] += w4[j];
546         w3[j]++;
547       }
548     }
549   }
550 
551   nrqs     = 0; /* no of outgoing messages */
552   msz      = 0; /* total mesg length (for all proc */
553   w1[rank] = 0; /* no mesg sent to intself */
554   w3[rank] = 0;
555   for (PetscMPIInt i = 0; i < size; i++) {
556     if (w1[i]) {
557       w2[i] = 1;
558       nrqs++;
559     } /* there exists a message to proc i */
560   }
561   /* pa - is list of processors to communicate with */
562   PetscCall(PetscMalloc1(nrqs, &pa));
563   for (PetscMPIInt i = 0, j = 0; i < size; i++) {
564     if (w1[i]) {
565       pa[j] = i;
566       j++;
567     }
568   }
569 
570   /* Each message would have a header = 1 + 2*(no of IS) + data */
571   for (PetscMPIInt i = 0; i < nrqs; i++) {
572     PetscMPIInt j = pa[i];
573     w1[j] += w2[j] + 2 * w3[j];
574     msz += w1[j];
575   }
576 
577   /* Determine the number of messages to expect, their lengths, from from-ids */
578   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
579   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
580 
581   /* Now post the Irecvs corresponding to these messages */
582   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
583 
584   /* Allocate Memory for outgoing messages */
585   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
586   PetscCall(PetscArrayzero(outdat, size));
587   PetscCall(PetscArrayzero(ptr, size));
588 
589   {
590     PetscInt *iptr = tmp, ict = 0;
591     for (PetscMPIInt i = 0; i < nrqs; i++) {
592       PetscMPIInt j = pa[i];
593       iptr += ict;
594       outdat[j] = iptr;
595       ict       = w1[j];
596     }
597   }
598 
599   /* Form the outgoing messages */
600   /* plug in the headers */
601   for (PetscMPIInt i = 0; i < nrqs; i++) {
602     PetscMPIInt j = pa[i];
603     outdat[j][0]  = 0;
604     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
605     ptr[j] = outdat[j] + 2 * w3[j] + 1;
606   }
607 
608   /* Memory for doing local proc's work */
609   {
610     PetscInt M_BPB_imax = 0;
611 #if defined(PETSC_USE_CTABLE)
612     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
613     PetscCall(PetscMalloc1(imax, &table_data));
614     for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
615     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
616     for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
617 #else
618     PetscInt Mimax = 0;
619     PetscCall(PetscIntMultError(M, imax, &Mimax));
620     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
621     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
622     for (PetscInt i = 0; i < imax; i++) {
623       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
624       data[i]  = d_p + M * i;
625     }
626 #endif
627   }
628 
629   /* Parse the IS and update local tables and the outgoing buf with the data */
630   {
631     PetscInt n_i, isz_i, *outdat_j, ctr_j;
632     PetscBT  table_i;
633 
634     for (PetscInt i = 0; i < imax; i++) {
635       PetscCall(PetscArrayzero(ctr, size));
636       n_i     = n[i];
637       table_i = table[i];
638       idx_i   = idx[i];
639 #if defined(PETSC_USE_CTABLE)
640       table_data_i = table_data[i];
641 #else
642       data_i = data[i];
643 #endif
644       isz_i = isz[i];
645       for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
646         row = idx_i[j];
647         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
648         if (proc != rank) { /* copy to the outgoing buffer */
649           ctr[proc]++;
650           *ptr[proc] = row;
651           ptr[proc]++;
652         } else if (!PetscBTLookupSet(table_i, row)) {
653 #if defined(PETSC_USE_CTABLE)
654           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
655 #else
656           data_i[isz_i] = row; /* Update the local table */
657 #endif
658           isz_i++;
659         }
660       }
661       /* Update the headers for the current IS */
662       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
663         if ((ctr_j = ctr[j])) {
664           outdat_j            = outdat[j];
665           k                   = ++outdat_j[0];
666           outdat_j[2 * k]     = ctr_j;
667           outdat_j[2 * k - 1] = i;
668         }
669       }
670       isz[i] = isz_i;
671     }
672   }
673 
674   /*  Now  post the sends */
675   PetscCall(PetscMalloc1(nrqs, &s_waits1));
676   for (PetscMPIInt i = 0; i < nrqs; ++i) {
677     PetscMPIInt j = pa[i];
678     PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
679   }
680 
681   /* No longer need the original indices */
682   PetscCall(PetscMalloc1(imax, &iscomms));
683   for (PetscInt i = 0; i < imax; ++i) {
684     PetscCall(ISRestoreIndices(is[i], idx + i));
685     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
686   }
687   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
688 
689   for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
690 
691     /* Do Local work */
692 #if defined(PETSC_USE_CTABLE)
693   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
694 #else
695   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
696 #endif
697 
698   /* Receive messages */
699   PetscCall(PetscMalloc1(nrqr, &recv_status));
700   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
701   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
702 
703   /* Phase 1 sends are complete - deallocate buffers */
704   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
705   PetscCall(PetscFree4(w1, w2, w3, w4));
706 
707   PetscCall(PetscMalloc1(nrqr, &xdata));
708   PetscCall(PetscMalloc1(nrqr, &isz1));
709   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
710   PetscCall(PetscFree(rbuf[0]));
711   PetscCall(PetscFree(rbuf));
712 
713   /* Send the data back */
714   /* Do a global reduction to know the buffer space req for incoming messages */
715   {
716     PetscMPIInt *rw1;
717 
718     PetscCall(PetscCalloc1(size, &rw1));
719     for (PetscMPIInt i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
722       PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
723     }
724     PetscCall(PetscFree(onodes1));
725     PetscCall(PetscFree(olengths1));
726 
727     /* Determine the number of messages to expect, their lengths, from from-ids */
728     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
729     PetscCall(PetscFree(rw1));
730   }
731   /* Now post the Irecvs corresponding to these messages */
732   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
733 
734   /* Now  post the sends */
735   PetscCall(PetscMalloc1(nrqr, &s_waits2));
736   for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); }
737 
738   /* receive work done on other processors */
739   {
740     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
741     PetscMPIInt idex;
742     PetscBT     table_i;
743 
744     for (PetscMPIInt i = 0; i < nrqs; ++i) {
745       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
746       /* Process the message */
747       rbuf2_i = rbuf2[idex];
748       ct1     = 2 * rbuf2_i[0] + 1;
749       jmax    = rbuf2[idex][0];
750       for (PetscInt j = 1; j <= jmax; j++) {
751         max     = rbuf2_i[2 * j];
752         is_no   = rbuf2_i[2 * j - 1];
753         isz_i   = isz[is_no];
754         table_i = table[is_no];
755 #if defined(PETSC_USE_CTABLE)
756         table_data_i = table_data[is_no];
757 #else
758         data_i = data[is_no];
759 #endif
760         for (k = 0; k < max; k++, ct1++) {
761           row = rbuf2_i[ct1];
762           if (!PetscBTLookupSet(table_i, row)) {
763 #if defined(PETSC_USE_CTABLE)
764             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
765 #else
766             data_i[isz_i] = row;
767 #endif
768             isz_i++;
769           }
770         }
771         isz[is_no] = isz_i;
772       }
773     }
774 
775     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
776   }
777 
778 #if defined(PETSC_USE_CTABLE)
779   tcount_max = 0;
780   for (PetscInt i = 0; i < imax; ++i) {
781     table_data_i = table_data[i];
782     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
783     if (tcount_max < tcount) tcount_max = tcount;
784   }
785   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
786 #endif
787 
788   for (PetscInt i = 0; i < imax; ++i) {
789 #if defined(PETSC_USE_CTABLE)
790     PetscHashIter tpos;
791     PetscInt      j;
792 
793     table_data_i = table_data[i];
794     PetscHashIterBegin(table_data_i, tpos);
795     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
796       PetscHashIterGetKey(table_data_i, tpos, k);
797       PetscHashIterGetVal(table_data_i, tpos, j);
798       PetscHashIterNext(table_data_i, tpos);
799       tdata[--j] = --k;
800     }
801     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
802 #else
803     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
804 #endif
805     PetscCall(PetscCommDestroy(&iscomms[i]));
806   }
807 
808   PetscCall(PetscFree(iscomms));
809   PetscCall(PetscFree(onodes2));
810   PetscCall(PetscFree(olengths2));
811 
812   PetscCall(PetscFree(pa));
813   PetscCall(PetscFree(rbuf2[0]));
814   PetscCall(PetscFree(rbuf2));
815   PetscCall(PetscFree(s_waits1));
816   PetscCall(PetscFree(r_waits1));
817   PetscCall(PetscFree(s_waits2));
818   PetscCall(PetscFree(r_waits2));
819   PetscCall(PetscFree(recv_status));
820   if (xdata) {
821     PetscCall(PetscFree(xdata[0]));
822     PetscCall(PetscFree(xdata));
823   }
824   PetscCall(PetscFree(isz1));
825 #if defined(PETSC_USE_CTABLE)
826   for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
827   PetscCall(PetscFree(table_data));
828   PetscCall(PetscFree(tdata));
829   PetscCall(PetscFree4(table, data, isz, t_p));
830 #else
831   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
832 #endif
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
836 /*
837    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
838        the work on the local processor.
839 
840      Inputs:
841       C      - MAT_MPIAIJ;
842       imax - total no of index sets processed at a time;
843       table  - an array of char - size = m bits.
844 
845      Output:
846       isz    - array containing the count of the solution elements corresponding
847                to each index set;
848       data or table_data  - pointer to the solutions
849 */
850 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
851 {
852   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
853   Mat         A = c->A, B = c->B;
854   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
855   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
856   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
857   PetscBT     table_i;
858 #if defined(PETSC_USE_CTABLE)
859   PetscHMapI    table_data_i;
860   PetscHashIter tpos;
861   PetscInt      tcount, *tdata;
862 #else
863   PetscInt *data_i;
864 #endif
865 
866   PetscFunctionBegin;
867   rstart = C->rmap->rstart;
868   cstart = C->cmap->rstart;
869   ai     = a->i;
870   aj     = a->j;
871   bi     = b->i;
872   bj     = b->j;
873   garray = c->garray;
874 
875   for (i = 0; i < imax; i++) {
876 #if defined(PETSC_USE_CTABLE)
877     /* copy existing entries of table_data_i into tdata[] */
878     table_data_i = table_data[i];
879     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
880     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
881 
882     PetscCall(PetscMalloc1(tcount, &tdata));
883     PetscHashIterBegin(table_data_i, tpos);
884     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
885       PetscHashIterGetKey(table_data_i, tpos, row);
886       PetscHashIterGetVal(table_data_i, tpos, j);
887       PetscHashIterNext(table_data_i, tpos);
888       tdata[--j] = --row;
889       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
890     }
891 #else
892     data_i = data[i];
893 #endif
894     table_i = table[i];
895     isz_i   = isz[i];
896     max     = isz[i];
897 
898     for (j = 0; j < max; j++) {
899 #if defined(PETSC_USE_CTABLE)
900       row = tdata[j] - rstart;
901 #else
902       row = data_i[j] - rstart;
903 #endif
904       start = ai[row];
905       end   = ai[row + 1];
906       for (k = start; k < end; k++) { /* Amat */
907         val = aj[k] + cstart;
908         if (!PetscBTLookupSet(table_i, val)) {
909 #if defined(PETSC_USE_CTABLE)
910           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
911 #else
912           data_i[isz_i] = val;
913 #endif
914           isz_i++;
915         }
916       }
917       start = bi[row];
918       end   = bi[row + 1];
919       for (k = start; k < end; k++) { /* Bmat */
920         val = garray[bj[k]];
921         if (!PetscBTLookupSet(table_i, val)) {
922 #if defined(PETSC_USE_CTABLE)
923           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
924 #else
925           data_i[isz_i] = val;
926 #endif
927           isz_i++;
928         }
929       }
930     }
931     isz[i] = isz_i;
932 
933 #if defined(PETSC_USE_CTABLE)
934     PetscCall(PetscFree(tdata));
935 #endif
936   }
937   PetscFunctionReturn(PETSC_SUCCESS);
938 }
939 
940 /*
941       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
942          and return the output
943 
944          Input:
945            C    - the matrix
946            nrqr - no of messages being processed.
947            rbuf - an array of pointers to the received requests
948 
949          Output:
950            xdata - array of messages to be sent back
951            isz1  - size of each message
952 
953   For better efficiency perhaps we should malloc separately each xdata[i],
954 then if a remalloc is required we need only copy the data for that one row
955 rather than all previous rows as it is now where a single large chunk of
956 memory is used.
957 
958 */
959 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
960 {
961   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
962   Mat         A = c->A, B = c->B;
963   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
964   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
965   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
966   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
967   PetscInt   *rbuf_i, kmax, rbuf_0;
968   PetscBT     xtable;
969 
970   PetscFunctionBegin;
971   m      = C->rmap->N;
972   rstart = C->rmap->rstart;
973   cstart = C->cmap->rstart;
974   ai     = a->i;
975   aj     = a->j;
976   bi     = b->i;
977   bj     = b->j;
978   garray = c->garray;
979 
980   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
981     rbuf_i = rbuf[i];
982     rbuf_0 = rbuf_i[0];
983     ct += rbuf_0;
984     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
985   }
986 
987   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
988   else max1 = 1;
989   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
990   if (nrqr) {
991     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
992     ++no_malloc;
993   }
994   PetscCall(PetscBTCreate(m, &xtable));
995   PetscCall(PetscArrayzero(isz1, nrqr));
996 
997   ct3 = 0;
998   for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */
999     rbuf_i = rbuf[i];
1000     rbuf_0 = rbuf_i[0];
1001     ct1    = 2 * rbuf_0 + 1;
1002     ct2    = ct1;
1003     ct3 += ct1;
1004     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1005       PetscCall(PetscBTMemzero(m, xtable));
1006       oct2 = ct2;
1007       kmax = rbuf_i[2 * j];
1008       for (k = 0; k < kmax; k++, ct1++) {
1009         row = rbuf_i[ct1];
1010         if (!PetscBTLookupSet(xtable, row)) {
1011           if (!(ct3 < mem_estimate)) {
1012             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1013             PetscCall(PetscMalloc1(new_estimate, &tmp));
1014             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1015             PetscCall(PetscFree(xdata[0]));
1016             xdata[0]     = tmp;
1017             mem_estimate = new_estimate;
1018             ++no_malloc;
1019             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1020           }
1021           xdata[i][ct2++] = row;
1022           ct3++;
1023         }
1024       }
1025       for (k = oct2, max2 = ct2; k < max2; k++) {
1026         row   = xdata[i][k] - rstart;
1027         start = ai[row];
1028         end   = ai[row + 1];
1029         for (l = start; l < end; l++) {
1030           val = aj[l] + cstart;
1031           if (!PetscBTLookupSet(xtable, val)) {
1032             if (!(ct3 < mem_estimate)) {
1033               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1034               PetscCall(PetscMalloc1(new_estimate, &tmp));
1035               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1036               PetscCall(PetscFree(xdata[0]));
1037               xdata[0]     = tmp;
1038               mem_estimate = new_estimate;
1039               ++no_malloc;
1040               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1041             }
1042             xdata[i][ct2++] = val;
1043             ct3++;
1044           }
1045         }
1046         start = bi[row];
1047         end   = bi[row + 1];
1048         for (l = start; l < end; l++) {
1049           val = garray[bj[l]];
1050           if (!PetscBTLookupSet(xtable, val)) {
1051             if (!(ct3 < mem_estimate)) {
1052               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1053               PetscCall(PetscMalloc1(new_estimate, &tmp));
1054               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1055               PetscCall(PetscFree(xdata[0]));
1056               xdata[0]     = tmp;
1057               mem_estimate = new_estimate;
1058               ++no_malloc;
1059               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1060             }
1061             xdata[i][ct2++] = val;
1062             ct3++;
1063           }
1064         }
1065       }
1066       /* Update the header*/
1067       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1068       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1069     }
1070     xdata[i][0] = rbuf_0;
1071     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1072     isz1[i] = ct2; /* size of each message */
1073   }
1074   PetscCall(PetscBTDestroy(&xtable));
1075   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1080 /*
1081     Every processor gets the entire matrix
1082 */
1083 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1084 {
1085   Mat         B;
1086   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1087   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1088   PetscMPIInt size, rank;
1089   PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1090   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1091 
1092   PetscFunctionBegin;
1093   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1094   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1095   if (scall == MAT_INITIAL_MATRIX) {
1096     /* Tell every processor the number of nonzeros per row */
1097     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1098     for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1099 
1100     /* All MPI processes get the same matrix */
1101     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1102 
1103     /*     Create the sequential matrix of the same type as the local block diagonal  */
1104     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1105     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1106     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1107     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1108     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1109     PetscCall(PetscCalloc1(2, Bin));
1110     **Bin = B;
1111     b     = (Mat_SeqAIJ *)B->data;
1112 
1113     /* zero column space */
1114     nrecv = 0;
1115     for (PetscMPIInt i = 0; i < size; i++) {
1116       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1117     }
1118     PetscCall(PetscArrayzero(b->j, nrecv));
1119 
1120     /*   Copy my part of matrix column indices over    */
1121     sendcount  = ad->nz + bd->nz;
1122     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1123     a_jsendbuf = ad->j;
1124     b_jsendbuf = bd->j;
1125     n          = A->rmap->rend - A->rmap->rstart;
1126     cnt        = 0;
1127     for (PetscInt i = 0; i < n; i++) {
1128       /* put in lower diagonal portion */
1129       m = bd->i[i + 1] - bd->i[i];
1130       while (m > 0) {
1131         /* is it above diagonal (in bd (compressed) numbering) */
1132         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1133         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134         m--;
1135       }
1136 
1137       /* put in diagonal portion */
1138       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1139 
1140       /* put in upper diagonal portion */
1141       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1142     }
1143     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1144 
1145     /* send column indices, b->j was zeroed */
1146     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1147 
1148     /*  Assemble the matrix into useable form (numerical values not yet set) */
1149     /* set the b->ilen (length of each row) values */
1150     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1151     /* set the b->i indices */
1152     b->i[0] = 0;
1153     for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1154     PetscCall(PetscFree(lens));
1155     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1156     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1157 
1158   } else {
1159     B = **Bin;
1160     b = (Mat_SeqAIJ *)B->data;
1161   }
1162 
1163   /* Copy my part of matrix numerical values into the values location */
1164   if (flag == MAT_GET_VALUES) {
1165     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166     MatScalar         *sendbuf;
1167 
1168     /* initialize b->a */
1169     PetscCall(PetscArrayzero(b->a, b->nz));
1170 
1171     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173     sendcount = ad->nz + bd->nz;
1174     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1175     a_sendbuf = ada;
1176     b_sendbuf = bda;
1177     b_sendj   = bd->j;
1178     n         = A->rmap->rend - A->rmap->rstart;
1179     cnt       = 0;
1180     for (PetscInt i = 0; i < n; i++) {
1181       /* put in lower diagonal portion */
1182       m = bd->i[i + 1] - bd->i[i];
1183       while (m > 0) {
1184         /* is it above diagonal (in bd (compressed) numbering) */
1185         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         m--;
1188         b_sendj++;
1189       }
1190 
1191       /* put in diagonal portion */
1192       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1193 
1194       /* put in upper diagonal portion */
1195       while (m-- > 0) {
1196         sendbuf[cnt++] = *b_sendbuf++;
1197         b_sendj++;
1198       }
1199     }
1200     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1201 
1202     /* send values, b->a was zeroed */
1203     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1204 
1205     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1206     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1207   } /* endof (flag == MAT_GET_VALUES) */
1208 
1209   PetscCall(MatPropagateSymmetryOptions(A, B));
1210   PetscFunctionReturn(PETSC_SUCCESS);
1211 }
1212 
1213 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1214 {
1215   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1216   Mat             submat, A = c->A, B = c->B;
1217   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1218   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1219   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1220   const PetscInt *icol, *irow;
1221   PetscInt        nrow, ncol, start;
1222   PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
1223   PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
1224   PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
1225   PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
1226   PetscInt       *lens, rmax, ncols, *cols, Crow;
1227 #if defined(PETSC_USE_CTABLE)
1228   PetscHMapI cmap, rmap;
1229   PetscInt  *cmap_loc, *rmap_loc;
1230 #else
1231   PetscInt *cmap, *rmap;
1232 #endif
1233   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
1234   PetscInt     *cworkB, lwrite, *subcols, ib, jb;
1235   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1236   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1237   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1238   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1239   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1240   MPI_Comm      comm;
1241   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1242   PetscMPIInt  *onodes1, *olengths1, idex, end, *row2proc;
1243   Mat_SubSppt  *smatis1;
1244   PetscBool     isrowsorted, iscolsorted;
1245 
1246   PetscFunctionBegin;
1247   PetscValidLogicalCollectiveInt(C, ismax, 2);
1248   PetscValidLogicalCollectiveEnum(C, scall, 5);
1249   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1250   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1251   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1252   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1253   size = c->size;
1254   rank = c->rank;
1255 
1256   PetscCall(ISSorted(iscol[0], &iscolsorted));
1257   PetscCall(ISSorted(isrow[0], &isrowsorted));
1258   PetscCall(ISGetIndices(isrow[0], &irow));
1259   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1260   if (allcolumns) {
1261     icol = NULL;
1262     ncol = C->cmap->N;
1263   } else {
1264     PetscCall(ISGetIndices(iscol[0], &icol));
1265     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1266   }
1267 
1268   if (scall == MAT_INITIAL_MATRIX) {
1269     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1270 
1271     /* Get some new tags to keep the communication clean */
1272     tag1 = ((PetscObject)C)->tag;
1273     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1274     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1275 
1276     /* evaluate communication - mesg to who, length of mesg, and buffer space
1277      required. Based on this, buffers are allocated, and data copied into them */
1278     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1279     PetscCall(PetscMalloc1(nrow, &row2proc));
1280 
1281     /* w1[proc] = num of rows owned by proc -- to be requested */
1282     proc = 0;
1283     nrqs = 0; /* num of outgoing messages */
1284     for (PetscInt j = 0; j < nrow; j++) {
1285       row = irow[j];
1286       if (!isrowsorted) proc = 0;
1287       while (row >= C->rmap->range[proc + 1]) proc++;
1288       w1[proc]++;
1289       row2proc[j] = proc; /* map row index to proc */
1290 
1291       if (proc != rank && !w2[proc]) {
1292         w2[proc] = 1;
1293         nrqs++;
1294       }
1295     }
1296     w1[rank] = 0; /* rows owned by self will not be requested */
1297 
1298     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1299     for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) {
1300       if (w1[proc]) pa[j++] = proc;
1301     }
1302 
1303     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1304     msz = 0; /* total mesg length (for all procs) */
1305     for (PetscMPIInt i = 0; i < nrqs; i++) {
1306       proc = pa[i];
1307       w1[proc] += 3;
1308       msz += w1[proc];
1309     }
1310     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
1311 
1312     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1313     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1314     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1315 
1316     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1317        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1318     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1319 
1320     /* Now post the Irecvs corresponding to these messages */
1321     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1322 
1323     PetscCall(PetscFree(onodes1));
1324     PetscCall(PetscFree(olengths1));
1325 
1326     /* Allocate Memory for outgoing messages */
1327     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1328     PetscCall(PetscArrayzero(sbuf1, size));
1329     PetscCall(PetscArrayzero(ptr, size));
1330 
1331     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1332     iptr = tmp;
1333     for (PetscMPIInt i = 0; i < nrqs; i++) {
1334       proc        = pa[i];
1335       sbuf1[proc] = iptr;
1336       iptr += w1[proc];
1337     }
1338 
1339     /* Form the outgoing messages */
1340     /* Initialize the header space */
1341     for (PetscMPIInt i = 0; i < nrqs; i++) {
1342       proc = pa[i];
1343       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1344       ptr[proc] = sbuf1[proc] + 3;
1345     }
1346 
1347     /* Parse the isrow and copy data into outbuf */
1348     PetscCall(PetscArrayzero(ctr, size));
1349     for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */
1350       proc = row2proc[j];
1351       if (proc != rank) { /* copy to the outgoing buf*/
1352         *ptr[proc] = irow[j];
1353         ctr[proc]++;
1354         ptr[proc]++;
1355       }
1356     }
1357 
1358     /* Update the headers for the current IS */
1359     for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
1360       if ((ctr_j = ctr[j])) {
1361         sbuf1_j            = sbuf1[j];
1362         k                  = ++sbuf1_j[0];
1363         sbuf1_j[2 * k]     = ctr_j;
1364         sbuf1_j[2 * k - 1] = 0;
1365       }
1366     }
1367 
1368     /* Now post the sends */
1369     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1370     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1371       proc = pa[i];
1372       PetscCallMPI(MPIU_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1373     }
1374 
1375     /* Post Receives to capture the buffer size */
1376     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1377     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1378 
1379     if (nrqs) rbuf2[0] = tmp + msz;
1380     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1381 
1382     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1383       proc = pa[i];
1384       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1385     }
1386 
1387     PetscCall(PetscFree2(w1, w2));
1388 
1389     /* Send to other procs the buf size they should allocate */
1390     /* Receive messages*/
1391     PetscCall(PetscMalloc1(nrqr, &r_status1));
1392     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1393 
1394     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1395     for (PetscMPIInt i = 0; i < nrqr; ++i) {
1396       req_size[i] = 0;
1397       rbuf1_i     = rbuf1[i];
1398       start       = 2 * rbuf1_i[0] + 1;
1399       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1400       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1401       sbuf2_i = sbuf2[i];
1402       for (PetscInt j = start; j < end; j++) {
1403         k          = rbuf1_i[j] - rstart;
1404         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1405         sbuf2_i[j] = ncols;
1406         req_size[i] += ncols;
1407       }
1408       req_source1[i] = r_status1[i].MPI_SOURCE;
1409 
1410       /* form the header */
1411       sbuf2_i[0] = req_size[i];
1412       for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1413 
1414       PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1415     }
1416 
1417     PetscCall(PetscFree(r_status1));
1418     PetscCall(PetscFree(r_waits1));
1419 
1420     /* rbuf2 is received, Post recv column indices a->j */
1421     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1422 
1423     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1424     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1425       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1426       req_source2[i] = r_status2[i].MPI_SOURCE;
1427       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1428     }
1429 
1430     /* Wait on sends1 and sends2 */
1431     PetscCall(PetscMalloc1(nrqs, &s_status1));
1432     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1433     PetscCall(PetscFree(s_waits1));
1434     PetscCall(PetscFree(s_status1));
1435 
1436     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1437     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1438 
1439     /* Now allocate sending buffers for a->j, and send them off */
1440     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1441     jcnt = 0;
1442     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1443     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
1444     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1445 
1446     for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */
1447       rbuf1_i   = rbuf1[i];
1448       sbuf_aj_i = sbuf_aj[i];
1449       ct1       = 2 * rbuf1_i[0] + 1;
1450       ct2       = 0;
1451       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1452 
1453       kmax = rbuf1[i][2];
1454       for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */
1455         row    = rbuf1_i[ct1] - rstart;
1456         nzA    = ai[row + 1] - ai[row];
1457         nzB    = bi[row + 1] - bi[row];
1458         ncols  = nzA + nzB;
1459         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1460         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1461 
1462         /* load the column indices for this row into cols*/
1463         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1464 
1465         lwrite = 0;
1466         for (PetscInt l = 0; l < nzB; l++) {
1467           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1468         }
1469         for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1470         for (PetscInt l = 0; l < nzB; l++) {
1471           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1472         }
1473 
1474         ct2 += ncols;
1475       }
1476       PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1477     }
1478 
1479     /* create column map (cmap): global col of C -> local col of submat */
1480 #if defined(PETSC_USE_CTABLE)
1481     if (!allcolumns) {
1482       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1483       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1484       for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1485         if (icol[j] >= cstart && icol[j] < cend) {
1486           cmap_loc[icol[j] - cstart] = j + 1;
1487         } else { /* use PetscHMapI for non-local col indices */
1488           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1489         }
1490       }
1491     } else {
1492       cmap     = NULL;
1493       cmap_loc = NULL;
1494     }
1495     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1496 #else
1497     if (!allcolumns) {
1498       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1499       for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1500     } else {
1501       cmap = NULL;
1502     }
1503 #endif
1504 
1505     /* Create lens for MatSeqAIJSetPreallocation() */
1506     PetscCall(PetscCalloc1(nrow, &lens));
1507 
1508     /* Compute lens from local part of C */
1509     for (PetscInt j = 0; j < nrow; j++) {
1510       row  = irow[j];
1511       proc = row2proc[j];
1512       if (proc == rank) {
1513         /* diagonal part A = c->A */
1514         ncols = ai[row - rstart + 1] - ai[row - rstart];
1515         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1516         if (!allcolumns) {
1517           for (PetscInt k = 0; k < ncols; k++) {
1518 #if defined(PETSC_USE_CTABLE)
1519             tcol = cmap_loc[cols[k]];
1520 #else
1521             tcol = cmap[cols[k] + cstart];
1522 #endif
1523             if (tcol) lens[j]++;
1524           }
1525         } else { /* allcolumns */
1526           lens[j] = ncols;
1527         }
1528 
1529         /* off-diagonal part B = c->B */
1530         ncols = bi[row - rstart + 1] - bi[row - rstart];
1531         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1532         if (!allcolumns) {
1533           for (PetscInt k = 0; k < ncols; k++) {
1534 #if defined(PETSC_USE_CTABLE)
1535             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1536 #else
1537             tcol = cmap[bmap[cols[k]]];
1538 #endif
1539             if (tcol) lens[j]++;
1540           }
1541         } else { /* allcolumns */
1542           lens[j] += ncols;
1543         }
1544       }
1545     }
1546 
1547     /* Create row map (rmap): global row of C -> local row of submat */
1548 #if defined(PETSC_USE_CTABLE)
1549     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1550     for (PetscInt j = 0; j < nrow; j++) {
1551       row  = irow[j];
1552       proc = row2proc[j];
1553       if (proc == rank) { /* a local row */
1554         rmap_loc[row - rstart] = j;
1555       } else {
1556         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1557       }
1558     }
1559 #else
1560     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1561     for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j;
1562 #endif
1563 
1564     /* Update lens from offproc data */
1565     /* recv a->j is done */
1566     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1567     for (PetscMPIInt i = 0; i < nrqs; i++) {
1568       proc    = pa[i];
1569       sbuf1_i = sbuf1[proc];
1570       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1571       ct1     = 2 + 1;
1572       ct2     = 0;
1573       rbuf2_i = rbuf2[i]; /* received length of C->j */
1574       rbuf3_i = rbuf3[i]; /* received C->j */
1575 
1576       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1577       max1 = sbuf1_i[2];
1578       for (PetscInt k = 0; k < max1; k++, ct1++) {
1579 #if defined(PETSC_USE_CTABLE)
1580         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1581         row--;
1582         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1583 #else
1584         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1585 #endif
1586         /* Now, store row index of submat in sbuf1_i[ct1] */
1587         sbuf1_i[ct1] = row;
1588 
1589         nnz = rbuf2_i[ct1];
1590         if (!allcolumns) {
1591           for (PetscMPIInt l = 0; l < nnz; l++, ct2++) {
1592 #if defined(PETSC_USE_CTABLE)
1593             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1594               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1595             } else {
1596               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1597             }
1598 #else
1599             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1600 #endif
1601             if (tcol) lens[row]++;
1602           }
1603         } else { /* allcolumns */
1604           lens[row] += nnz;
1605         }
1606       }
1607     }
1608     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1609     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1610 
1611     /* Create the submatrices */
1612     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1613     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1614 
1615     PetscCall(ISGetBlockSize(isrow[0], &ib));
1616     PetscCall(ISGetBlockSize(iscol[0], &jb));
1617     if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb));
1618     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1619     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1620 
1621     /* create struct Mat_SubSppt and attached it to submat */
1622     PetscCall(PetscNew(&smatis1));
1623     subc            = (Mat_SeqAIJ *)submat->data;
1624     subc->submatis1 = smatis1;
1625 
1626     smatis1->id          = 0;
1627     smatis1->nrqs        = nrqs;
1628     smatis1->nrqr        = nrqr;
1629     smatis1->rbuf1       = rbuf1;
1630     smatis1->rbuf2       = rbuf2;
1631     smatis1->rbuf3       = rbuf3;
1632     smatis1->sbuf2       = sbuf2;
1633     smatis1->req_source2 = req_source2;
1634 
1635     smatis1->sbuf1 = sbuf1;
1636     smatis1->ptr   = ptr;
1637     smatis1->tmp   = tmp;
1638     smatis1->ctr   = ctr;
1639 
1640     smatis1->pa          = pa;
1641     smatis1->req_size    = req_size;
1642     smatis1->req_source1 = req_source1;
1643 
1644     smatis1->allcolumns = allcolumns;
1645     smatis1->singleis   = PETSC_TRUE;
1646     smatis1->row2proc   = row2proc;
1647     smatis1->rmap       = rmap;
1648     smatis1->cmap       = cmap;
1649 #if defined(PETSC_USE_CTABLE)
1650     smatis1->rmap_loc = rmap_loc;
1651     smatis1->cmap_loc = cmap_loc;
1652 #endif
1653 
1654     smatis1->destroy     = submat->ops->destroy;
1655     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1656     submat->factortype   = C->factortype;
1657 
1658     /* compute rmax */
1659     rmax = 0;
1660     for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1661 
1662   } else { /* scall == MAT_REUSE_MATRIX */
1663     submat = submats[0];
1664     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1665 
1666     subc        = (Mat_SeqAIJ *)submat->data;
1667     rmax        = subc->rmax;
1668     smatis1     = subc->submatis1;
1669     nrqs        = smatis1->nrqs;
1670     nrqr        = smatis1->nrqr;
1671     rbuf1       = smatis1->rbuf1;
1672     rbuf2       = smatis1->rbuf2;
1673     rbuf3       = smatis1->rbuf3;
1674     req_source2 = smatis1->req_source2;
1675 
1676     sbuf1 = smatis1->sbuf1;
1677     sbuf2 = smatis1->sbuf2;
1678     ptr   = smatis1->ptr;
1679     tmp   = smatis1->tmp;
1680     ctr   = smatis1->ctr;
1681 
1682     pa          = smatis1->pa;
1683     req_size    = smatis1->req_size;
1684     req_source1 = smatis1->req_source1;
1685 
1686     allcolumns = smatis1->allcolumns;
1687     row2proc   = smatis1->row2proc;
1688     rmap       = smatis1->rmap;
1689     cmap       = smatis1->cmap;
1690 #if defined(PETSC_USE_CTABLE)
1691     rmap_loc = smatis1->rmap_loc;
1692     cmap_loc = smatis1->cmap_loc;
1693 #endif
1694   }
1695 
1696   /* Post recv matrix values */
1697   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1698   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1699   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1700   for (PetscMPIInt i = 0; i < nrqs; ++i) {
1701     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1702     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1703   }
1704 
1705   /* Allocate sending buffers for a->a, and send them off */
1706   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1707   jcnt = 0;
1708   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1709   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
1710   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1711 
1712   for (PetscMPIInt i = 0; i < nrqr; i++) {
1713     rbuf1_i   = rbuf1[i];
1714     sbuf_aa_i = sbuf_aa[i];
1715     ct1       = 2 * rbuf1_i[0] + 1;
1716     ct2       = 0;
1717     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1718 
1719     kmax = rbuf1_i[2];
1720     for (PetscInt k = 0; k < kmax; k++, ct1++) {
1721       row    = rbuf1_i[ct1] - rstart;
1722       nzA    = ai[row + 1] - ai[row];
1723       nzB    = bi[row + 1] - bi[row];
1724       ncols  = nzA + nzB;
1725       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1726       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1727       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1728 
1729       /* load the column values for this row into vals*/
1730       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1731 
1732       lwrite = 0;
1733       for (PetscInt l = 0; l < nzB; l++) {
1734         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1735       }
1736       for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1737       for (PetscInt l = 0; l < nzB; l++) {
1738         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1739       }
1740 
1741       ct2 += ncols;
1742     }
1743     PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1744   }
1745 
1746   /* Assemble submat */
1747   /* First assemble the local rows */
1748   for (PetscInt j = 0; j < nrow; j++) {
1749     row  = irow[j];
1750     proc = row2proc[j];
1751     if (proc == rank) {
1752       Crow = row - rstart; /* local row index of C */
1753 #if defined(PETSC_USE_CTABLE)
1754       row = rmap_loc[Crow]; /* row index of submat */
1755 #else
1756       row = rmap[row];
1757 #endif
1758 
1759       if (allcolumns) {
1760         PetscInt ncol = 0;
1761 
1762         /* diagonal part A = c->A */
1763         ncols = ai[Crow + 1] - ai[Crow];
1764         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1765         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1766         for (PetscInt k = 0; k < ncols; k++) {
1767           subcols[ncol]   = cols[k] + cstart;
1768           subvals[ncol++] = vals[k];
1769         }
1770 
1771         /* off-diagonal part B = c->B */
1772         ncols = bi[Crow + 1] - bi[Crow];
1773         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1774         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1775         for (PetscInt k = 0; k < ncols; k++) {
1776           subcols[ncol]   = bmap[cols[k]];
1777           subvals[ncol++] = vals[k];
1778         }
1779 
1780         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1781 
1782       } else { /* !allcolumns */
1783         PetscInt ncol = 0;
1784 
1785 #if defined(PETSC_USE_CTABLE)
1786         /* diagonal part A = c->A */
1787         ncols = ai[Crow + 1] - ai[Crow];
1788         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1789         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1790         for (PetscInt k = 0; k < ncols; k++) {
1791           tcol = cmap_loc[cols[k]];
1792           if (tcol) {
1793             subcols[ncol]   = --tcol;
1794             subvals[ncol++] = vals[k];
1795           }
1796         }
1797 
1798         /* off-diagonal part B = c->B */
1799         ncols = bi[Crow + 1] - bi[Crow];
1800         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1801         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1802         for (PetscInt k = 0; k < ncols; k++) {
1803           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1804           if (tcol) {
1805             subcols[ncol]   = --tcol;
1806             subvals[ncol++] = vals[k];
1807           }
1808         }
1809 #else
1810         /* diagonal part A = c->A */
1811         ncols = ai[Crow + 1] - ai[Crow];
1812         cols  = aj + ai[Crow];
1813         vals  = a_a + ai[Crow];
1814         for (PetscInt k = 0; k < ncols; k++) {
1815           tcol = cmap[cols[k] + cstart];
1816           if (tcol) {
1817             subcols[ncol]   = --tcol;
1818             subvals[ncol++] = vals[k];
1819           }
1820         }
1821 
1822         /* off-diagonal part B = c->B */
1823         ncols = bi[Crow + 1] - bi[Crow];
1824         cols  = bj + bi[Crow];
1825         vals  = b_a + bi[Crow];
1826         for (PetscInt k = 0; k < ncols; k++) {
1827           tcol = cmap[bmap[cols[k]]];
1828           if (tcol) {
1829             subcols[ncol]   = --tcol;
1830             subvals[ncol++] = vals[k];
1831           }
1832         }
1833 #endif
1834         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1835       }
1836     }
1837   }
1838 
1839   /* Now assemble the off-proc rows */
1840   for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */
1841     /* recv values from other processes */
1842     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1843     proc    = pa[idex];
1844     sbuf1_i = sbuf1[proc];
1845     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1846     ct1     = 2 + 1;
1847     ct2     = 0;           /* count of received C->j */
1848     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1849     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1850     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1851     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1852 
1853     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1854     max1 = sbuf1_i[2];                           /* num of rows */
1855     for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */
1856       row = sbuf1_i[ct1];                        /* row index of submat */
1857       if (!allcolumns) {
1858         idex = 0;
1859         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860           nnz = rbuf2_i[ct1];                         /* num of C entries in this row */
1861           for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1862 #if defined(PETSC_USE_CTABLE)
1863             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865             } else {
1866               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1867             }
1868 #else
1869             tcol = cmap[rbuf3_i[ct2]];
1870 #endif
1871             if (tcol) {
1872               subcols[idex]   = --tcol; /* may not be sorted */
1873               subvals[idex++] = rbuf4_i[ct2];
1874 
1875               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1876                For reuse, we replace received C->j with index that should be inserted to submat */
1877               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878             }
1879           }
1880           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1881         } else { /* scall == MAT_REUSE_MATRIX */
1882           submat = submats[0];
1883           subc   = (Mat_SeqAIJ *)submat->data;
1884 
1885           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886           for (PetscInt l = 0; l < nnz; l++) {
1887             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888             subvals[idex++] = rbuf4_i[ct2];
1889           }
1890 
1891           bj = subc->j + subc->i[row]; /* sorted column indices */
1892           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1893         }
1894       } else {              /* allcolumns */
1895         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1897         ct2 += nnz;
1898       }
1899     }
1900   }
1901 
1902   /* sending a->a are done */
1903   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1904   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1905 
1906   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1907   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1908   submats[0] = submat;
1909 
1910   /* Restore the indices */
1911   PetscCall(ISRestoreIndices(isrow[0], &irow));
1912   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1913 
1914   /* Destroy allocated memory */
1915   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1916   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1917   if (sbuf_aa) {
1918     PetscCall(PetscFree(sbuf_aa[0]));
1919     PetscCall(PetscFree(sbuf_aa));
1920   }
1921 
1922   if (scall == MAT_INITIAL_MATRIX) {
1923     PetscCall(PetscFree(lens));
1924     if (sbuf_aj) {
1925       PetscCall(PetscFree(sbuf_aj[0]));
1926       PetscCall(PetscFree(sbuf_aj));
1927     }
1928   }
1929   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1930   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1931   PetscFunctionReturn(PETSC_SUCCESS);
1932 }
1933 
1934 static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935 {
1936   PetscInt  ncol;
1937   PetscBool colflag, allcolumns = PETSC_FALSE;
1938 
1939   PetscFunctionBegin;
1940   /* Allocate memory to hold all the submatrices */
1941   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1942 
1943   /* Check for special case: each processor gets entire matrix columns */
1944   PetscCall(ISIdentity(iscol[0], &colflag));
1945   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1946   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1947 
1948   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1949   PetscFunctionReturn(PETSC_SUCCESS);
1950 }
1951 
/*
   MatCreateSubMatrices_MPIAIJ - extracts ismax sequential submatrices from the parallel
   matrix C, one per (isrow[i], iscol[i]) pair, processing the index sets in stages to
   bound the amount of communication buffer memory used at any one time.

   Collective on C: every rank must execute the same number of stages, so the local
   stage count is reduced with MPI_MAX below.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extracted per stage, sized so that one stage's
     integer workspace stays near 20 MB; guard against nmax == 0 for very wide C */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single identity row IS and identity column IS covering the whole square
         matrix means the caller wants a copy of all of C: use the fast path */
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Negate wantallmatrix so that the MPI_MAX reduction below computes the logical
       AND (i.e. the minimum) across ranks: the fast all-matrix path is taken only if
       every rank requested it. nstages uses the plain MAX so all ranks agree on the
       largest stage count. */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCallMPI(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      /* Recover the communication plan saved on the first submatrix */
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else {
      /* Reuse the stage count recorded during the initial extraction */
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  /* Stage loop: every rank executes all nstages iterations (collective calls inside),
     even after its own local index sets are exhausted (max_no == 0) */
  for (PetscInt i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2048 
2049 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2050 {
2051   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2052   Mat              A = c->A;
2053   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2054   const PetscInt **icol, **irow;
2055   PetscInt        *nrow, *ncol, start;
2056   PetscMPIInt      nrqs = 0, *pa, proc = -1;
2057   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
2058   PetscInt       **sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row;
2059   PetscInt         msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol;
2060   PetscInt       **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2061   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2062 #if defined(PETSC_USE_CTABLE)
2063   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2064 #else
2065   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2066 #endif
2067   const PetscInt *irow_i;
2068   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2069   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2070   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2071   MPI_Comm        comm;
2072   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2073   PetscMPIInt    *onodes1, *olengths1, end, **row2proc, *row2proc_i;
2074   PetscInt        ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2075   Mat_SubSppt    *smat_i;
2076   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2077   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt;
2078 
2079   PetscFunctionBegin;
2080   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2081   size = c->size;
2082   rank = c->rank;
2083 
2084   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2085   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2086 
2087   for (PetscInt i = 0; i < ismax; i++) {
2088     PetscCall(ISSorted(iscol[i], &issorted[i]));
2089     if (!issorted[i]) iscsorted = issorted[i];
2090 
2091     PetscCall(ISSorted(isrow[i], &issorted[i]));
2092 
2093     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2094     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2095 
2096     /* Check for special case: allcolumn */
2097     PetscCall(ISIdentity(iscol[i], &colflag));
2098     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2099     if (colflag && ncol[i] == C->cmap->N) {
2100       allcolumns[i] = PETSC_TRUE;
2101       icol[i]       = NULL;
2102     } else {
2103       allcolumns[i] = PETSC_FALSE;
2104       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2105     }
2106   }
2107 
2108   if (scall == MAT_REUSE_MATRIX) {
2109     /* Assumes new rows are same length as the old rows */
2110     for (PetscInt i = 0; i < ismax; i++) {
2111       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2112       subc = (Mat_SeqAIJ *)submats[i]->data;
2113       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2114 
2115       /* Initial matrix as if empty */
2116       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2117 
2118       smat_i = subc->submatis1;
2119 
2120       nrqs        = smat_i->nrqs;
2121       nrqr        = smat_i->nrqr;
2122       rbuf1       = smat_i->rbuf1;
2123       rbuf2       = smat_i->rbuf2;
2124       rbuf3       = smat_i->rbuf3;
2125       req_source2 = smat_i->req_source2;
2126 
2127       sbuf1 = smat_i->sbuf1;
2128       sbuf2 = smat_i->sbuf2;
2129       ptr   = smat_i->ptr;
2130       tmp   = smat_i->tmp;
2131       ctr   = smat_i->ctr;
2132 
2133       pa          = smat_i->pa;
2134       req_size    = smat_i->req_size;
2135       req_source1 = smat_i->req_source1;
2136 
2137       allcolumns[i] = smat_i->allcolumns;
2138       row2proc[i]   = smat_i->row2proc;
2139       rmap[i]       = smat_i->rmap;
2140       cmap[i]       = smat_i->cmap;
2141     }
2142 
2143     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2144       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2145       smat_i = (Mat_SubSppt *)submats[0]->data;
2146 
2147       nrqs        = smat_i->nrqs;
2148       nrqr        = smat_i->nrqr;
2149       rbuf1       = smat_i->rbuf1;
2150       rbuf2       = smat_i->rbuf2;
2151       rbuf3       = smat_i->rbuf3;
2152       req_source2 = smat_i->req_source2;
2153 
2154       sbuf1 = smat_i->sbuf1;
2155       sbuf2 = smat_i->sbuf2;
2156       ptr   = smat_i->ptr;
2157       tmp   = smat_i->tmp;
2158       ctr   = smat_i->ctr;
2159 
2160       pa          = smat_i->pa;
2161       req_size    = smat_i->req_size;
2162       req_source1 = smat_i->req_source1;
2163 
2164       allcolumns[0] = PETSC_FALSE;
2165     }
2166   } else { /* scall == MAT_INITIAL_MATRIX */
2167     /* Get some new tags to keep the communication clean */
2168     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2169     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2170 
2171     /* evaluate communication - mesg to who, length of mesg, and buffer space
2172      required. Based on this, buffers are allocated, and data copied into them*/
2173     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2174 
2175     for (PetscInt i = 0; i < ismax; i++) {
2176       jmax   = nrow[i];
2177       irow_i = irow[i];
2178 
2179       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2180       row2proc[i] = row2proc_i;
2181 
2182       if (issorted[i]) proc = 0;
2183       for (PetscInt j = 0; j < jmax; j++) {
2184         if (!issorted[i]) proc = 0;
2185         row = irow_i[j];
2186         while (row >= C->rmap->range[proc + 1]) proc++;
2187         w4[proc]++;
2188         row2proc_i[j] = proc; /* map row index to proc */
2189       }
2190       for (PetscMPIInt j = 0; j < size; j++) {
2191         if (w4[j]) {
2192           w1[j] += w4[j];
2193           w3[j]++;
2194           w4[j] = 0;
2195         }
2196       }
2197     }
2198 
2199     nrqs     = 0; /* no of outgoing messages */
2200     msz      = 0; /* total mesg length (for all procs) */
2201     w1[rank] = 0; /* no mesg sent to self */
2202     w3[rank] = 0;
2203     for (PetscMPIInt i = 0; i < size; i++) {
2204       if (w1[i]) {
2205         w2[i] = 1;
2206         nrqs++;
2207       } /* there exists a message to proc i */
2208     }
2209     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2210     for (PetscMPIInt i = 0, j = 0; i < size; i++) {
2211       if (w1[i]) {
2212         pa[j] = i;
2213         j++;
2214       }
2215     }
2216 
2217     /* Each message would have a header = 1 + 2*(no of IS) + data */
2218     for (PetscMPIInt i = 0; i < nrqs; i++) {
2219       PetscMPIInt j = pa[i];
2220       w1[j] += w2[j] + 2 * w3[j];
2221       msz += w1[j];
2222     }
2223     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
2224 
2225     /* Determine the number of messages to expect, their lengths, from from-ids */
2226     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2227     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2228 
2229     /* Now post the Irecvs corresponding to these messages */
2230     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2231     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2232 
2233     /* Allocate Memory for outgoing messages */
2234     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2235     PetscCall(PetscArrayzero(sbuf1, size));
2236     PetscCall(PetscArrayzero(ptr, size));
2237 
2238     {
2239       PetscInt *iptr = tmp;
2240       k              = 0;
2241       for (PetscMPIInt i = 0; i < nrqs; i++) {
2242         PetscMPIInt j = pa[i];
2243         iptr += k;
2244         sbuf1[j] = iptr;
2245         k        = w1[j];
2246       }
2247     }
2248 
2249     /* Form the outgoing messages. Initialize the header space */
2250     for (PetscMPIInt i = 0; i < nrqs; i++) {
2251       PetscMPIInt j = pa[i];
2252       sbuf1[j][0]   = 0;
2253       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2254       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2255     }
2256 
2257     /* Parse the isrow and copy data into outbuf */
2258     for (PetscInt i = 0; i < ismax; i++) {
2259       row2proc_i = row2proc[i];
2260       PetscCall(PetscArrayzero(ctr, size));
2261       irow_i = irow[i];
2262       jmax   = nrow[i];
2263       for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */
2264         proc = row2proc_i[j];
2265         if (proc != rank) { /* copy to the outgoing buf*/
2266           ctr[proc]++;
2267           *ptr[proc] = irow_i[j];
2268           ptr[proc]++;
2269         }
2270       }
2271       /* Update the headers for the current IS */
2272       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
2273         if ((ctr_j = ctr[j])) {
2274           sbuf1_j            = sbuf1[j];
2275           k                  = ++sbuf1_j[0];
2276           sbuf1_j[2 * k]     = ctr_j;
2277           sbuf1_j[2 * k - 1] = i;
2278         }
2279       }
2280     }
2281 
2282     /*  Now  post the sends */
2283     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2284     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2285       PetscMPIInt j = pa[i];
2286       PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2287     }
2288 
2289     /* Post Receives to capture the buffer size */
2290     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2291     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2292     if (nrqs) rbuf2[0] = tmp + msz;
2293     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2294     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2295       PetscMPIInt j = pa[i];
2296       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2297     }
2298 
2299     /* Send to other procs the buf size they should allocate */
2300     /* Receive messages*/
2301     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2302     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2303     {
2304       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2305       PetscInt *sbuf2_i;
2306 
2307       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2308       for (PetscMPIInt i = 0; i < nrqr; ++i) {
2309         req_size[i] = 0;
2310         rbuf1_i     = rbuf1[i];
2311         start       = 2 * rbuf1_i[0] + 1;
2312         end         = olengths1[i];
2313         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2314         sbuf2_i = sbuf2[i];
2315         for (PetscInt j = start; j < end; j++) {
2316           id         = rbuf1_i[j] - rstart;
2317           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2318           sbuf2_i[j] = ncols;
2319           req_size[i] += ncols;
2320         }
2321         req_source1[i] = onodes1[i];
2322         /* form the header */
2323         sbuf2_i[0] = req_size[i];
2324         for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2325 
2326         PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2327       }
2328     }
2329 
2330     PetscCall(PetscFree(onodes1));
2331     PetscCall(PetscFree(olengths1));
2332     PetscCall(PetscFree(r_waits1));
2333     PetscCall(PetscFree4(w1, w2, w3, w4));
2334 
2335     /* Receive messages*/
2336     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2337     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2338     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2339       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2340       req_source2[i] = pa[i];
2341       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2342     }
2343     PetscCall(PetscFree(r_waits2));
2344 
2345     /* Wait on sends1 and sends2 */
2346     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2347     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2348     PetscCall(PetscFree(s_waits1));
2349     PetscCall(PetscFree(s_waits2));
2350 
2351     /* Now allocate sending buffers for a->j, and send them off */
2352     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2353     jcnt = 0;
2354     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2355     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
2356     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2357 
2358     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2359     {
2360       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2361       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2362       PetscInt  cend = C->cmap->rend;
2363       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2364 
2365       for (PetscMPIInt i = 0; i < nrqr; i++) {
2366         rbuf1_i   = rbuf1[i];
2367         sbuf_aj_i = sbuf_aj[i];
2368         ct1       = 2 * rbuf1_i[0] + 1;
2369         ct2       = 0;
2370         for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2371           kmax = rbuf1[i][2 * j];
2372           for (PetscInt k = 0; k < kmax; k++, ct1++) {
2373             row    = rbuf1_i[ct1] - rstart;
2374             nzA    = a_i[row + 1] - a_i[row];
2375             nzB    = b_i[row + 1] - b_i[row];
2376             ncols  = nzA + nzB;
2377             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2378             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2379 
2380             /* load the column indices for this row into cols */
2381             cols = sbuf_aj_i + ct2;
2382 
2383             lwrite = 0;
2384             for (PetscInt l = 0; l < nzB; l++) {
2385               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2386             }
2387             for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2388             for (PetscInt l = 0; l < nzB; l++) {
2389               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2390             }
2391 
2392             ct2 += ncols;
2393           }
2394         }
2395         PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2396       }
2397     }
2398 
2399     /* create col map: global col of C -> local col of submatrices */
2400     {
2401       const PetscInt *icol_i;
2402 #if defined(PETSC_USE_CTABLE)
2403       for (PetscInt i = 0; i < ismax; i++) {
2404         if (!allcolumns[i]) {
2405           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2406 
2407           jmax   = ncol[i];
2408           icol_i = icol[i];
2409           cmap_i = cmap[i];
2410           for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2411         } else cmap[i] = NULL;
2412       }
2413 #else
2414       for (PetscInt i = 0; i < ismax; i++) {
2415         if (!allcolumns[i]) {
2416           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2417           jmax   = ncol[i];
2418           icol_i = icol[i];
2419           cmap_i = cmap[i];
2420           for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2421         } else cmap[i] = NULL;
2422       }
2423 #endif
2424     }
2425 
2426     /* Create lens which is required for MatCreate... */
2427     jcnt = 0;
2428     for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i];
2429     PetscCall(PetscMalloc1(ismax, &lens));
2430 
2431     if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0]));
2432     for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
2433 
2434     /* Update lens from local data */
2435     for (PetscInt i = 0; i < ismax; i++) {
2436       row2proc_i = row2proc[i];
2437       jmax       = nrow[i];
2438       if (!allcolumns[i]) cmap_i = cmap[i];
2439       irow_i = irow[i];
2440       lens_i = lens[i];
2441       for (PetscInt j = 0; j < jmax; j++) {
2442         row  = irow_i[j];
2443         proc = row2proc_i[j];
2444         if (proc == rank) {
2445           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2446           if (!allcolumns[i]) {
2447             for (PetscInt k = 0; k < ncols; k++) {
2448 #if defined(PETSC_USE_CTABLE)
2449               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2450 #else
2451               tcol = cmap_i[cols[k]];
2452 #endif
2453               if (tcol) lens_i[j]++;
2454             }
2455           } else { /* allcolumns */
2456             lens_i[j] = ncols;
2457           }
2458           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2459         }
2460       }
2461     }
2462 
2463     /* Create row map: global row of C -> local row of submatrices */
2464 #if defined(PETSC_USE_CTABLE)
2465     for (PetscInt i = 0; i < ismax; i++) {
2466       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2467       irow_i = irow[i];
2468       jmax   = nrow[i];
2469       for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2470     }
2471 #else
2472     for (PetscInt i = 0; i < ismax; i++) {
2473       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2474       rmap_i = rmap[i];
2475       irow_i = irow[i];
2476       jmax   = nrow[i];
2477       for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2478     }
2479 #endif
2480 
2481     /* Update lens from offproc data */
2482     {
2483       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2484 
2485       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2486       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2487         sbuf1_i = sbuf1[pa[tmp2]];
2488         jmax    = sbuf1_i[0];
2489         ct1     = 2 * jmax + 1;
2490         ct2     = 0;
2491         rbuf2_i = rbuf2[tmp2];
2492         rbuf3_i = rbuf3[tmp2];
2493         for (PetscInt j = 1; j <= jmax; j++) {
2494           is_no  = sbuf1_i[2 * j - 1];
2495           max1   = sbuf1_i[2 * j];
2496           lens_i = lens[is_no];
2497           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2498           rmap_i = rmap[is_no];
2499           for (PetscInt k = 0; k < max1; k++, ct1++) {
2500 #if defined(PETSC_USE_CTABLE)
2501             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2502             row--;
2503             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2504 #else
2505             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2506 #endif
2507             max2 = rbuf2_i[ct1];
2508             for (PetscInt l = 0; l < max2; l++, ct2++) {
2509               if (!allcolumns[is_no]) {
2510 #if defined(PETSC_USE_CTABLE)
2511                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2512 #else
2513                 tcol = cmap_i[rbuf3_i[ct2]];
2514 #endif
2515                 if (tcol) lens_i[row]++;
2516               } else {         /* allcolumns */
2517                 lens_i[row]++; /* lens_i[row] += max2 ? */
2518               }
2519             }
2520           }
2521         }
2522       }
2523     }
2524     PetscCall(PetscFree(r_waits3));
2525     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2526     PetscCall(PetscFree(s_waits3));
2527 
2528     /* Create the submatrices */
2529     for (PetscInt i = 0; i < ismax; i++) {
2530       PetscInt rbs, cbs;
2531 
2532       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2533       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2534 
2535       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2536       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2537 
2538       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2539       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2540       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2541 
2542       /* create struct Mat_SubSppt and attached it to submat */
2543       PetscCall(PetscNew(&smat_i));
2544       subc            = (Mat_SeqAIJ *)submats[i]->data;
2545       subc->submatis1 = smat_i;
2546 
2547       smat_i->destroy          = submats[i]->ops->destroy;
2548       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2549       submats[i]->factortype   = C->factortype;
2550 
2551       smat_i->id          = i;
2552       smat_i->nrqs        = nrqs;
2553       smat_i->nrqr        = nrqr;
2554       smat_i->rbuf1       = rbuf1;
2555       smat_i->rbuf2       = rbuf2;
2556       smat_i->rbuf3       = rbuf3;
2557       smat_i->sbuf2       = sbuf2;
2558       smat_i->req_source2 = req_source2;
2559 
2560       smat_i->sbuf1 = sbuf1;
2561       smat_i->ptr   = ptr;
2562       smat_i->tmp   = tmp;
2563       smat_i->ctr   = ctr;
2564 
2565       smat_i->pa          = pa;
2566       smat_i->req_size    = req_size;
2567       smat_i->req_source1 = req_source1;
2568 
2569       smat_i->allcolumns = allcolumns[i];
2570       smat_i->singleis   = PETSC_FALSE;
2571       smat_i->row2proc   = row2proc[i];
2572       smat_i->rmap       = rmap[i];
2573       smat_i->cmap       = cmap[i];
2574     }
2575 
2576     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2577       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2578       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2579       PetscCall(MatSetType(submats[0], MATDUMMY));
2580 
2581       /* create struct Mat_SubSppt and attached it to submat */
2582       PetscCall(PetscNew(&smat_i));
2583       submats[0]->data = (void *)smat_i;
2584 
2585       smat_i->destroy          = submats[0]->ops->destroy;
2586       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2587       submats[0]->factortype   = C->factortype;
2588 
2589       smat_i->id          = 0;
2590       smat_i->nrqs        = nrqs;
2591       smat_i->nrqr        = nrqr;
2592       smat_i->rbuf1       = rbuf1;
2593       smat_i->rbuf2       = rbuf2;
2594       smat_i->rbuf3       = rbuf3;
2595       smat_i->sbuf2       = sbuf2;
2596       smat_i->req_source2 = req_source2;
2597 
2598       smat_i->sbuf1 = sbuf1;
2599       smat_i->ptr   = ptr;
2600       smat_i->tmp   = tmp;
2601       smat_i->ctr   = ctr;
2602 
2603       smat_i->pa          = pa;
2604       smat_i->req_size    = req_size;
2605       smat_i->req_source1 = req_source1;
2606 
2607       smat_i->allcolumns = PETSC_FALSE;
2608       smat_i->singleis   = PETSC_FALSE;
2609       smat_i->row2proc   = NULL;
2610       smat_i->rmap       = NULL;
2611       smat_i->cmap       = NULL;
2612     }
2613 
2614     if (ismax) PetscCall(PetscFree(lens[0]));
2615     PetscCall(PetscFree(lens));
2616     if (sbuf_aj) {
2617       PetscCall(PetscFree(sbuf_aj[0]));
2618       PetscCall(PetscFree(sbuf_aj));
2619     }
2620 
2621   } /* endof scall == MAT_INITIAL_MATRIX */
2622 
2623   /* Post recv matrix values */
2624   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2625   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2626   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2627   for (PetscMPIInt i = 0; i < nrqs; ++i) {
2628     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2629     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2630   }
2631 
2632   /* Allocate sending buffers for a->a, and send them off */
2633   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2634   jcnt = 0;
2635   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2636   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
2637   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2638 
2639   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2640   {
2641     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2642     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2643     PetscInt     cend = C->cmap->rend;
2644     PetscInt    *b_j  = b->j;
2645     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2646 
2647     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2648     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2649     for (PetscMPIInt i = 0; i < nrqr; i++) {
2650       rbuf1_i   = rbuf1[i];
2651       sbuf_aa_i = sbuf_aa[i];
2652       ct1       = 2 * rbuf1_i[0] + 1;
2653       ct2       = 0;
2654       for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2655         kmax = rbuf1_i[2 * j];
2656         for (PetscInt k = 0; k < kmax; k++, ct1++) {
2657           row    = rbuf1_i[ct1] - rstart;
2658           nzA    = a_i[row + 1] - a_i[row];
2659           nzB    = b_i[row + 1] - b_i[row];
2660           ncols  = nzA + nzB;
2661           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2662           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2663           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2664 
2665           /* load the column values for this row into vals*/
2666           vals = sbuf_aa_i + ct2;
2667 
2668           lwrite = 0;
2669           for (PetscInt l = 0; l < nzB; l++) {
2670             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2671           }
2672           for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2673           for (PetscInt l = 0; l < nzB; l++) {
2674             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2675           }
2676 
2677           ct2 += ncols;
2678         }
2679       }
2680       PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2681     }
2682     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2683     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2684   }
2685 
2686   /* Assemble the matrices */
2687   /* First assemble the local rows */
2688   for (PetscInt i = 0; i < ismax; i++) {
2689     row2proc_i = row2proc[i];
2690     subc       = (Mat_SeqAIJ *)submats[i]->data;
2691     imat_ilen  = subc->ilen;
2692     imat_j     = subc->j;
2693     imat_i     = subc->i;
2694     imat_a     = subc->a;
2695 
2696     if (!allcolumns[i]) cmap_i = cmap[i];
2697     rmap_i = rmap[i];
2698     irow_i = irow[i];
2699     jmax   = nrow[i];
2700     for (PetscInt j = 0; j < jmax; j++) {
2701       row  = irow_i[j];
2702       proc = row2proc_i[j];
2703       if (proc == rank) {
2704         old_row = row;
2705 #if defined(PETSC_USE_CTABLE)
2706         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2707         row--;
2708 #else
2709         row = rmap_i[row];
2710 #endif
2711         ilen_row = imat_ilen[row];
2712         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2713         mat_i = imat_i[row];
2714         mat_a = imat_a + mat_i;
2715         mat_j = imat_j + mat_i;
2716         if (!allcolumns[i]) {
2717           for (PetscInt k = 0; k < ncols; k++) {
2718 #if defined(PETSC_USE_CTABLE)
2719             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2720 #else
2721             tcol = cmap_i[cols[k]];
2722 #endif
2723             if (tcol) {
2724               *mat_j++ = tcol - 1;
2725               *mat_a++ = vals[k];
2726               ilen_row++;
2727             }
2728           }
2729         } else { /* allcolumns */
2730           for (PetscInt k = 0; k < ncols; k++) {
2731             *mat_j++ = cols[k]; /* global col index! */
2732             *mat_a++ = vals[k];
2733             ilen_row++;
2734           }
2735         }
2736         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2737 
2738         imat_ilen[row] = ilen_row;
2739       }
2740     }
2741   }
2742 
2743   /* Now assemble the off proc rows */
2744   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2745   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2746     sbuf1_i = sbuf1[pa[tmp2]];
2747     jmax    = sbuf1_i[0];
2748     ct1     = 2 * jmax + 1;
2749     ct2     = 0;
2750     rbuf2_i = rbuf2[tmp2];
2751     rbuf3_i = rbuf3[tmp2];
2752     rbuf4_i = rbuf4[tmp2];
2753     for (PetscInt j = 1; j <= jmax; j++) {
2754       is_no  = sbuf1_i[2 * j - 1];
2755       rmap_i = rmap[is_no];
2756       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2757       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2758       imat_ilen = subc->ilen;
2759       imat_j    = subc->j;
2760       imat_i    = subc->i;
2761       imat_a    = subc->a;
2762       max1      = sbuf1_i[2 * j];
2763       for (PetscInt k = 0; k < max1; k++, ct1++) {
2764         row = sbuf1_i[ct1];
2765 #if defined(PETSC_USE_CTABLE)
2766         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2767         row--;
2768 #else
2769         row = rmap_i[row];
2770 #endif
2771         ilen  = imat_ilen[row];
2772         mat_i = imat_i[row];
2773         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2774         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2775         max2  = rbuf2_i[ct1];
2776         if (!allcolumns[is_no]) {
2777           for (PetscInt l = 0; l < max2; l++, ct2++) {
2778 #if defined(PETSC_USE_CTABLE)
2779             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2780 #else
2781             tcol = cmap_i[rbuf3_i[ct2]];
2782 #endif
2783             if (tcol) {
2784               *mat_j++ = tcol - 1;
2785               *mat_a++ = rbuf4_i[ct2];
2786               ilen++;
2787             }
2788           }
2789         } else { /* allcolumns */
2790           for (PetscInt l = 0; l < max2; l++, ct2++) {
2791             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2792             *mat_a++ = rbuf4_i[ct2];
2793             ilen++;
2794           }
2795         }
2796         imat_ilen[row] = ilen;
2797       }
2798     }
2799   }
2800 
2801   if (!iscsorted) { /* sort column indices of the rows */
2802     for (PetscInt i = 0; i < ismax; i++) {
2803       subc      = (Mat_SeqAIJ *)submats[i]->data;
2804       imat_j    = subc->j;
2805       imat_i    = subc->i;
2806       imat_a    = subc->a;
2807       imat_ilen = subc->ilen;
2808 
2809       if (allcolumns[i]) continue;
2810       jmax = nrow[i];
2811       for (PetscInt j = 0; j < jmax; j++) {
2812         mat_i = imat_i[j];
2813         mat_a = imat_a + mat_i;
2814         mat_j = imat_j + mat_i;
2815         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2816       }
2817     }
2818   }
2819 
2820   PetscCall(PetscFree(r_waits4));
2821   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2822   PetscCall(PetscFree(s_waits4));
2823 
2824   /* Restore the indices */
2825   for (PetscInt i = 0; i < ismax; i++) {
2826     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2827     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2828   }
2829 
2830   for (PetscInt i = 0; i < ismax; i++) {
2831     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2832     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2833   }
2834 
2835   /* Destroy allocated memory */
2836   if (sbuf_aa) {
2837     PetscCall(PetscFree(sbuf_aa[0]));
2838     PetscCall(PetscFree(sbuf_aa));
2839   }
2840   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2841 
2842   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2843   PetscCall(PetscFree(rbuf4));
2844 
2845   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2846   PetscFunctionReturn(PETSC_SUCCESS);
2847 }
2848 
2849 /*
2850  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2851  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2853  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2854  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2855  state, and needs to be "assembled" later by compressing B's column space.
2856 
2857  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2858  Following this call, C->A & C->B have been created, even if empty.
2859  */
2860 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2861 {
2862   /* If making this function public, change the error returned in this function away from _PLIB. */
2863   Mat_MPIAIJ     *aij;
2864   Mat_SeqAIJ     *Baij;
2865   PetscBool       seqaij, Bdisassembled;
2866   PetscInt        m, n, *nz, ngcol, col, rstart, rend, shift, count;
2867   PetscScalar     v;
2868   const PetscInt *rowindices, *colindices;
2869 
2870   PetscFunctionBegin;
2871   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2872   if (A) {
2873     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2874     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2875     if (rowemb) {
2876       PetscCall(ISGetLocalSize(rowemb, &m));
2877       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2878     } else {
2879       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2880     }
2881     if (dcolemb) {
2882       PetscCall(ISGetLocalSize(dcolemb, &n));
2883       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2884     } else {
2885       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2886     }
2887   }
2888   if (B) {
2889     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2890     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2891     if (rowemb) {
2892       PetscCall(ISGetLocalSize(rowemb, &m));
2893       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2894     } else {
2895       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2896     }
2897     if (ocolemb) {
2898       PetscCall(ISGetLocalSize(ocolemb, &n));
2899       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2900     } else {
2901       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2902     }
2903   }
2904 
2905   aij = (Mat_MPIAIJ *)C->data;
2906   if (!aij->A) {
2907     /* Mimic parts of MatMPIAIJSetPreallocation() */
2908     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2909     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2910     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2911     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2912   }
2913   if (A) {
2914     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2915   } else {
2916     PetscCall(MatSetUp(aij->A));
2917   }
2918   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2919     /*
2920       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2921       need to "disassemble" B -- convert it to using C's global indices.
2922       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2923 
2924       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2925       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2926 
2927       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2928       At least avoid calling MatSetValues() and the implied searches?
2929     */
2930 
2931     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2932 #if defined(PETSC_USE_CTABLE)
2933       PetscCall(PetscHMapIDestroy(&aij->colmap));
2934 #else
2935       PetscCall(PetscFree(aij->colmap));
2936       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2937 #endif
2938       ngcol = 0;
2939       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2940       if (aij->garray) PetscCall(PetscFree(aij->garray));
2941       PetscCall(VecDestroy(&aij->lvec));
2942       PetscCall(VecScatterDestroy(&aij->Mvctx));
2943     }
2944     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2945     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2946   }
2947   Bdisassembled = PETSC_FALSE;
2948   if (!aij->B) {
2949     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2950     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2951     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2952     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2953     Bdisassembled = PETSC_TRUE;
2954   }
2955   if (B) {
2956     Baij = (Mat_SeqAIJ *)B->data;
2957     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2958       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2959       for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2960       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2961       PetscCall(PetscFree(nz));
2962     }
2963 
2964     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2965     shift      = rend - rstart;
2966     count      = 0;
2967     rowindices = NULL;
2968     colindices = NULL;
2969     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2970     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2971     for (PetscInt i = 0; i < B->rmap->n; i++) {
2972       PetscInt row;
2973       row = i;
2974       if (rowindices) row = rowindices[i];
2975       for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2976         col = Baij->j[count];
2977         if (colindices) col = colindices[col];
2978         if (Bdisassembled && col >= rstart) col += shift;
2979         v = Baij->a[count];
2980         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2981         ++count;
2982       }
2983     }
2984     /* No assembly for aij->B is necessary. */
2985     /* FIXME: set aij->B's nonzerostate correctly. */
2986   } else {
2987     PetscCall(MatSetUp(aij->B));
2988   }
2989   C->preallocated  = PETSC_TRUE;
2990   C->was_assembled = PETSC_FALSE;
2991   C->assembled     = PETSC_FALSE;
2992   /*
2993       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2994       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2995    */
2996   PetscFunctionReturn(PETSC_SUCCESS);
2997 }
2998 
2999 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3001  */
3002 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3003 {
3004   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3005 
3006   PetscFunctionBegin;
3007   PetscAssertPointer(A, 2);
3008   PetscAssertPointer(B, 3);
3009   /* FIXME: make sure C is assembled */
3010   *A = aij->A;
3011   *B = aij->B;
3012   /* Note that we don't incref *A and *B, so be careful! */
3013   PetscFunctionReturn(PETSC_SUCCESS);
3014 }
3015 
3016 /*
3017   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3018   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3019 */
3020 static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3021 {
3022   PetscMPIInt size, flag;
3023   PetscInt    cismax, ispar;
3024   Mat        *A, *B;
3025   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3026 
3027   PetscFunctionBegin;
3028   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3029 
3030   cismax = 0;
3031   for (PetscInt i = 0; i < ismax; ++i) {
3032     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3033     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3034     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3035     if (size > 1) ++cismax;
3036   }
3037 
3038   /*
3039      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3040      ispar counts the number of parallel ISs across C's comm.
3041   */
3042   PetscCallMPI(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3043   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3044     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3045     PetscFunctionReturn(PETSC_SUCCESS);
3046   }
3047 
3048   /* if (ispar) */
3049   /*
3050     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3051     These are used to extract the off-diag portion of the resulting parallel matrix.
3052     The row IS for the off-diag portion is the same as for the diag portion,
3053     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3054   */
3055   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3056   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3057   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3058     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3059     if (size > 1) {
3060       /*
3061          TODO: This is the part that's ***NOT SCALABLE***.
3062          To fix this we need to extract just the indices of C's nonzero columns
3063          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3064          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3065          to be done without serializing on the IS list, so, most likely, it is best
3066          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3067       */
3068       PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3069       /* Now we have to
3070          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3071              were sorted on each rank, concatenated they might no longer be sorted;
3072          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3073              indices in the nondecreasing order to the original index positions.
3074          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3075       */
3076       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3077       PetscCall(ISSort(ciscol[ii]));
3078       ++ii;
3079     }
3080   }
3081   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3082   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3083     PetscInt        issize;
3084     const PetscInt *indices;
3085 
3086     /*
3087        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3088      */
3089     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3090     PetscCall(ISSort(isrow[i]));
3091     PetscCall(ISGetLocalSize(isrow[i], &issize));
3092     PetscCall(ISGetIndices(isrow[i], &indices));
3093     for (PetscInt j = 1; j < issize; ++j) {
3094       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3095     }
3096     PetscCall(ISRestoreIndices(isrow[i], &indices));
3097     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3098     PetscCall(ISSort(iscol[i]));
3099     PetscCall(ISGetLocalSize(iscol[i], &issize));
3100     PetscCall(ISGetIndices(iscol[i], &indices));
3101     for (PetscInt j = 1; j < issize; ++j) {
3102       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3103     }
3104     PetscCall(ISRestoreIndices(iscol[i], &indices));
3105     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3106     if (size > 1) {
3107       cisrow[ii] = isrow[i];
3108       ++ii;
3109     }
3110   }
3111   /*
3112     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3113     array of sequential matrices underlying the resulting parallel matrices.
3114     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3115     contain duplicates.
3116 
3117     There are as many diag matrices as there are original index sets. There are only as many parallel
3118     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3119 
3120     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3121     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3122       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3123       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3124       with A[i] and B[ii] extracted from the corresponding MPI submat.
3125     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3126       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3127       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3128       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3129       values into A[i] and B[ii] sitting inside the corresponding submat.
3130     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3131       A[i], B[ii], AA[i] or BB[ii] matrices.
3132   */
3133   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3134   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3135 
3136   /* Now obtain the sequential A and B submatrices separately. */
3137   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3138   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3139   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3140 
3141   /*
3142     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3143     matrices A & B have been extracted directly into the parallel matrices containing them, or
3144     simply into the sequential matrix identical with the corresponding A (if size == 1).
3145     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3146     to have the same sparsity pattern.
3147     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3148     must be constructed for C. This is done by setseqmat(s).
3149   */
3150   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3151     /*
3152        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3153        That way we can avoid sorting and computing permutations when reusing.
3154        To this end:
3155         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3156         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3157           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3158     */
3159     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3160 
3161     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3162     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3163     if (size > 1) {
3164       if (scall == MAT_INITIAL_MATRIX) {
3165         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3166         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3167         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3168         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3169         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3170       }
3171       /*
3172         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3173       */
3174       {
3175         Mat AA = A[i], BB = B[ii];
3176 
3177         if (AA || BB) {
3178           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3179           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3180           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3181         }
3182         PetscCall(MatDestroy(&AA));
3183       }
3184       PetscCall(ISDestroy(ciscol + ii));
3185       PetscCall(ISDestroy(ciscol_p + ii));
3186       ++ii;
3187     } else { /* if (size == 1) */
3188       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3189       if (isrow_p[i] || iscol_p[i]) {
3190         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3191         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3192         /* Otherwise A is extracted straight into (*submats)[i]. */
3193         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3194         PetscCall(MatDestroy(A + i));
3195       } else (*submat)[i] = A[i];
3196     }
3197     PetscCall(ISDestroy(&isrow_p[i]));
3198     PetscCall(ISDestroy(&iscol_p[i]));
3199   }
3200   PetscCall(PetscFree2(cisrow, ciscol));
3201   PetscCall(PetscFree2(isrow_p, iscol_p));
3202   PetscCall(PetscFree(ciscol_p));
3203   PetscCall(PetscFree(A));
3204   PetscCall(MatDestroySubMatrices(cismax, &B));
3205   PetscFunctionReturn(PETSC_SUCCESS);
3206 }
3207 
/*
  MPI-submatrix extraction for MATMPIAIJ: dispatches the generic MPIXAIJ driver with the
  AIJ-specific callbacks for sequential extraction, local-block access, and embedding.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3214