xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 73fdd05bb67e49f40fd8fd311695ff6fdf0b9b8a)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
25    that is a very major recoding job.
26 
27    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
28 */
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
/*
   One memory-scalable round of overlap increase.

   Collects the row indices of all nidx index sets, ships the rows owned by other
   ranks to their owners through a PetscSF, lets the owners expand each row into
   its column indices (MatIncreaseOverlap_MPIAIJ_Send_Scalable), performs the
   purely local expansion while the reply is in flight, and finally merges the
   returned columns back into the index sets
   (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).  The is[] entries are destroyed
   and recreated by the helpers called here.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  /* compact the per-rank counts into dense <rank, size> lists; each requested row
     occupies two entries in the message (hence the *2) */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  /* build the SF leaves: each incoming entry points at slot <offset+j> of the
     sender's packed todata buffer */
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* reverse direction: the former senders now receive the expanded rows back */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
230 
/*
   Merges the overlap data returned by the owning ranks back into the index sets.

   recvdata is a sequence of triples <is id, data_size, data_size column indices>
   packed back to back; nrecvs is its total length in PetscInt entries.  Each
   is[i] is rebuilt as a sorted, duplicate-free general IS containing its old
   entries plus the newly received columns.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  /* isz[i] tracks the current fill of is[i]'s scratch region; max_lsize is the
     largest current IS so one uniform region size can serve all of them */
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a region of max_lsize + nrecvs entries: enough for its current
     indices plus, in the worst case, every received value */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* hold a reference to the communicator since the IS is destroyed before being recreated */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}
278 
/*
   Expands rows requested by remote ranks into their column indices so the
   overlap can be shipped back to the requesters.

   fromrows holds nfrom messages of pairs <is id, row> (fromsizes[2*i] entries
   from rank fromranks[i]; the requested rows are owned by this rank).  For every
   requesting rank and every index set, the union of the columns of the requested
   rows is computed, sorted, and packed as triples <is id, number of columns,
   columns>.  On exit *sbrows is the packed send buffer and *sbrowsizes holds
   per-rank pairs <size, offset> suitable for PetscCommBuildTwoSided(); both are
   allocated here and ownership passes to the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count the deduplicated columns per <rank, is> pair so the
     exact size of the send buffer is known before it is allocated */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* requested rows are locally owned; convert to local numbering */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: repeat the expansion, this time packing the columns into sbdata */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs for the two-sided exchange */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
400 
/*
   Purely local part of one overlap round: each is[i] is rebuilt as the sorted,
   duplicate-free union of the columns of its locally owned rows.  Rows owned by
   other ranks are skipped here (their expansion comes back through the
   send/receive path).  Columns of the diagonal block (Amat) are shifted by
   cstart into global numbering; columns of the off-diagonal block (Bmat) are
   mapped through the garray gcols.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remote rows are handled by the owning rank */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* hold the communicator while the IS is destroyed and recreated */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
467 
468 /*
469   Sample message format:
470   If a processor A wants processor B to process some elements corresponding
471   to index sets is[1],is[5]
472   mesg [0] = 2   (no of index sets in the mesg)
473   -----------
474   mesg [1] = 1 => is[1]
475   mesg [2] = sizeof(is[1]);
476   -----------
477   mesg [3] = 5  => is[5]
478   mesg [4] = sizeof(is[5]);
479   -----------
480   mesg [5]
481   mesg [n]  data(is[1])
482   -----------
483   mesg[n+1]
484   mesg[m]  data(is[5])
485   -----------
486 
487   Notes:
488   nrqs - no of requests sent (or to be sent out)
489   nrqr - no of requests received (which have to be or which have been processed)
490 */
491 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492 {
493   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495   const PetscInt **idx, *idx_i;
496   PetscInt        *n, **data, len;
497 #if defined(PETSC_USE_CTABLE)
498   PetscHMapI *table_data, table_data_i;
499   PetscInt   *tdata, tcount, tcount_max;
500 #else
501   PetscInt *data_i, *d_p;
502 #endif
503   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506   PetscBT     *table;
507   MPI_Comm     comm;
508   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509   MPI_Status  *recv_status;
510   MPI_Comm    *iscomms;
511   char        *t_p;
512 
513   PetscFunctionBegin;
514   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515   size = c->size;
516   rank = c->rank;
517   M    = C->rmap->N;
518 
519   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
521 
522   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
523 
524   for (i = 0; i < imax; i++) {
525     PetscCall(ISGetIndices(is[i], &idx[i]));
526     PetscCall(ISGetLocalSize(is[i], &n[i]));
527   }
528 
529   /* evaluate communication - mesg to who,length of mesg, and buffer space
530      required. Based on this, buffers are allocated, and data copied into them  */
531   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532   for (i = 0; i < imax; i++) {
533     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534     idx_i = idx[i];
535     len   = n[i];
536     for (j = 0; j < len; j++) {
537       row = idx_i[j];
538       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540       w4[proc]++;
541     }
542     for (j = 0; j < size; j++) {
543       if (w4[j]) {
544         w1[j] += w4[j];
545         w3[j]++;
546       }
547     }
548   }
549 
550   nrqs     = 0; /* no of outgoing messages */
551   msz      = 0; /* total mesg length (for all proc */
552   w1[rank] = 0; /* no mesg sent to itself */
553   w3[rank] = 0;
554   for (i = 0; i < size; i++) {
555     if (w1[i]) {
556       w2[i] = 1;
557       nrqs++;
558     } /* there exists a message to proc i */
559   }
560   /* pa - is list of processors to communicate with */
561   PetscCall(PetscMalloc1(nrqs, &pa));
562   for (i = 0, j = 0; i < size; i++) {
563     if (w1[i]) {
564       pa[j] = i;
565       j++;
566     }
567   }
568 
569   /* Each message would have a header = 1 + 2*(no of IS) + data */
570   for (i = 0; i < nrqs; i++) {
571     j = pa[i];
572     w1[j] += w2[j] + 2 * w3[j];
573     msz += w1[j];
574   }
575 
576   /* Determine the number of messages to expect, their lengths, from from-ids */
577   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
579 
580   /* Now post the Irecvs corresponding to these messages */
581   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
582 
583   /* Allocate Memory for outgoing messages */
584   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585   PetscCall(PetscArrayzero(outdat, size));
586   PetscCall(PetscArrayzero(ptr, size));
587 
588   {
589     PetscInt *iptr = tmp, ict = 0;
590     for (i = 0; i < nrqs; i++) {
591       j = pa[i];
592       iptr += ict;
593       outdat[j] = iptr;
594       ict       = w1[j];
595     }
596   }
597 
598   /* Form the outgoing messages */
599   /* plug in the headers */
600   for (i = 0; i < nrqs; i++) {
601     j            = pa[i];
602     outdat[j][0] = 0;
603     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605   }
606 
607   /* Memory for doing local proc's work */
608   {
609     PetscInt M_BPB_imax = 0;
610 #if defined(PETSC_USE_CTABLE)
611     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612     PetscCall(PetscMalloc1(imax, &table_data));
613     for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616 #else
617     PetscInt Mimax = 0;
618     PetscCall(PetscIntMultError(M, imax, &Mimax));
619     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621     for (i = 0; i < imax; i++) {
622       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623       data[i]  = d_p + M * i;
624     }
625 #endif
626   }
627 
628   /* Parse the IS and update local tables and the outgoing buf with the data */
629   {
630     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631     PetscBT  table_i;
632 
633     for (i = 0; i < imax; i++) {
634       PetscCall(PetscArrayzero(ctr, size));
635       n_i     = n[i];
636       table_i = table[i];
637       idx_i   = idx[i];
638 #if defined(PETSC_USE_CTABLE)
639       table_data_i = table_data[i];
640 #else
641       data_i = data[i];
642 #endif
643       isz_i = isz[i];
644       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645         row = idx_i[j];
646         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647         if (proc != rank) { /* copy to the outgoing buffer */
648           ctr[proc]++;
649           *ptr[proc] = row;
650           ptr[proc]++;
651         } else if (!PetscBTLookupSet(table_i, row)) {
652 #if defined(PETSC_USE_CTABLE)
653           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654 #else
655           data_i[isz_i] = row; /* Update the local table */
656 #endif
657           isz_i++;
658         }
659       }
660       /* Update the headers for the current IS */
661       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662         if ((ctr_j = ctr[j])) {
663           outdat_j            = outdat[j];
664           k                   = ++outdat_j[0];
665           outdat_j[2 * k]     = ctr_j;
666           outdat_j[2 * k - 1] = i;
667         }
668       }
669       isz[i] = isz_i;
670     }
671   }
672 
673   /*  Now  post the sends */
674   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675   for (i = 0; i < nrqs; ++i) {
676     j = pa[i];
677     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678   }
679 
680   /* No longer need the original indices */
681   PetscCall(PetscMalloc1(imax, &iscomms));
682   for (i = 0; i < imax; ++i) {
683     PetscCall(ISRestoreIndices(is[i], idx + i));
684     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685   }
686   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
687 
688   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
689 
690     /* Do Local work */
691 #if defined(PETSC_USE_CTABLE)
692   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693 #else
694   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695 #endif
696 
697   /* Receive messages */
698   PetscCall(PetscMalloc1(nrqr, &recv_status));
699   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
701 
702   /* Phase 1 sends are complete - deallocate buffers */
703   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704   PetscCall(PetscFree4(w1, w2, w3, w4));
705 
706   PetscCall(PetscMalloc1(nrqr, &xdata));
707   PetscCall(PetscMalloc1(nrqr, &isz1));
708   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709   PetscCall(PetscFree(rbuf[0]));
710   PetscCall(PetscFree(rbuf));
711 
712   /* Send the data back */
713   /* Do a global reduction to know the buffer space req for incoming messages */
714   {
715     PetscMPIInt *rw1;
716 
717     PetscCall(PetscCalloc1(size, &rw1));
718 
719     for (i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721 
722       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723       rw1[proc] = isz1[i];
724     }
725     PetscCall(PetscFree(onodes1));
726     PetscCall(PetscFree(olengths1));
727 
728     /* Determine the number of messages to expect, their lengths, from from-ids */
729     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730     PetscCall(PetscFree(rw1));
731   }
732   /* Now post the Irecvs corresponding to these messages */
733   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
734 
735   /* Now  post the sends */
736   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737   for (i = 0; i < nrqr; ++i) {
738     j = recv_status[i].MPI_SOURCE;
739     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740   }
741 
742   /* receive work done on other processors */
743   {
744     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745     PetscMPIInt idex;
746     PetscBT     table_i;
747 
748     for (i = 0; i < nrqs; ++i) {
749       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750       /* Process the message */
751       rbuf2_i = rbuf2[idex];
752       ct1     = 2 * rbuf2_i[0] + 1;
753       jmax    = rbuf2[idex][0];
754       for (j = 1; j <= jmax; j++) {
755         max     = rbuf2_i[2 * j];
756         is_no   = rbuf2_i[2 * j - 1];
757         isz_i   = isz[is_no];
758         table_i = table[is_no];
759 #if defined(PETSC_USE_CTABLE)
760         table_data_i = table_data[is_no];
761 #else
762         data_i = data[is_no];
763 #endif
764         for (k = 0; k < max; k++, ct1++) {
765           row = rbuf2_i[ct1];
766           if (!PetscBTLookupSet(table_i, row)) {
767 #if defined(PETSC_USE_CTABLE)
768             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769 #else
770             data_i[isz_i] = row;
771 #endif
772             isz_i++;
773           }
774         }
775         isz[is_no] = isz_i;
776       }
777     }
778 
779     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780   }
781 
782 #if defined(PETSC_USE_CTABLE)
783   tcount_max = 0;
784   for (i = 0; i < imax; ++i) {
785     table_data_i = table_data[i];
786     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787     if (tcount_max < tcount) tcount_max = tcount;
788   }
789   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790 #endif
791 
792   for (i = 0; i < imax; ++i) {
793 #if defined(PETSC_USE_CTABLE)
794     PetscHashIter tpos;
795 
796     table_data_i = table_data[i];
797     PetscHashIterBegin(table_data_i, tpos);
798     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799       PetscHashIterGetKey(table_data_i, tpos, k);
800       PetscHashIterGetVal(table_data_i, tpos, j);
801       PetscHashIterNext(table_data_i, tpos);
802       tdata[--j] = --k;
803     }
804     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805 #else
806     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807 #endif
808     PetscCall(PetscCommDestroy(&iscomms[i]));
809   }
810 
811   PetscCall(PetscFree(iscomms));
812   PetscCall(PetscFree(onodes2));
813   PetscCall(PetscFree(olengths2));
814 
815   PetscCall(PetscFree(pa));
816   PetscCall(PetscFree(rbuf2[0]));
817   PetscCall(PetscFree(rbuf2));
818   PetscCall(PetscFree(s_waits1));
819   PetscCall(PetscFree(r_waits1));
820   PetscCall(PetscFree(s_waits2));
821   PetscCall(PetscFree(r_waits2));
822   PetscCall(PetscFree(recv_status));
823   if (xdata) {
824     PetscCall(PetscFree(xdata[0]));
825     PetscCall(PetscFree(xdata));
826   }
827   PetscCall(PetscFree(isz1));
828 #if defined(PETSC_USE_CTABLE)
829   for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830   PetscCall(PetscFree(table_data));
831   PetscCall(PetscFree(tdata));
832   PetscCall(PetscFree4(table, data, isz, t_p));
833 #else
834   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835 #endif
836   PetscFunctionReturn(PETSC_SUCCESS);
837 }
838 
839 /*
840    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
841        the work on the local processor.
842 
843      Inputs:
844       C      - MAT_MPIAIJ;
845       imax - total no of index sets processed at a time;
846       table  - an array of char - size = m bits.
847 
848      Output:
849       isz    - array containing the count of the solution elements corresponding
850                to each index set;
851       data or table_data  - pointer to the solutions
852 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* A: diagonal block, B: off-diagonal block of the MPIAIJ matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i;
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps compressed off-diagonal column indices to global columns */

  /* For each index set i: walk the rows already collected in data[i]/table_data[i] and
     add every column connected to those rows (from both A and B) that is not yet in
     the set; table[i] (a bitmap) guards against duplicates */
  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      /* entries are stored shifted by one (key = row+1, val = position+1, see inserts below);
         undo both shifts while scattering into tdata[] */
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i]; /* running size of index set i; grows as new columns are added */
    max     = isz[i]; /* only the rows present on entry are expanded in this pass */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: local columns, shift to global numbering */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) { /* first time this column is seen */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: map compressed columns through garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* publish the enlarged set size */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
942 
943 /*
944       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945          and return the output
946 
947          Input:
948            C    - the matrix
949            nrqr - no of messages being processed.
950            rbuf - an array of pointers to the received requests
951 
952          Output:
953            xdata - array of messages to be sent back
954            isz1  - size of each message
955 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
960 
961 */
962 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963 {
964   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965   Mat         A = c->A, B = c->B;
966   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970   PetscInt   *rbuf_i, kmax, rbuf_0;
971   PetscBT     xtable;
972 
973   PetscFunctionBegin;
974   m      = C->rmap->N;
975   rstart = C->rmap->rstart;
976   cstart = C->cmap->rstart;
977   ai     = a->i;
978   aj     = a->j;
979   bi     = b->i;
980   bj     = b->j;
981   garray = c->garray;
982 
983   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984     rbuf_i = rbuf[i];
985     rbuf_0 = rbuf_i[0];
986     ct += rbuf_0;
987     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988   }
989 
990   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991   else max1 = 1;
992   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993   if (nrqr) {
994     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995     ++no_malloc;
996   }
997   PetscCall(PetscBTCreate(m, &xtable));
998   PetscCall(PetscArrayzero(isz1, nrqr));
999 
1000   ct3 = 0;
1001   for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */
1002     rbuf_i = rbuf[i];
1003     rbuf_0 = rbuf_i[0];
1004     ct1    = 2 * rbuf_0 + 1;
1005     ct2    = ct1;
1006     ct3 += ct1;
1007     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008       PetscCall(PetscBTMemzero(m, xtable));
1009       oct2 = ct2;
1010       kmax = rbuf_i[2 * j];
1011       for (k = 0; k < kmax; k++, ct1++) {
1012         row = rbuf_i[ct1];
1013         if (!PetscBTLookupSet(xtable, row)) {
1014           if (!(ct3 < mem_estimate)) {
1015             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016             PetscCall(PetscMalloc1(new_estimate, &tmp));
1017             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018             PetscCall(PetscFree(xdata[0]));
1019             xdata[0]     = tmp;
1020             mem_estimate = new_estimate;
1021             ++no_malloc;
1022             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023           }
1024           xdata[i][ct2++] = row;
1025           ct3++;
1026         }
1027       }
1028       for (k = oct2, max2 = ct2; k < max2; k++) {
1029         row   = xdata[i][k] - rstart;
1030         start = ai[row];
1031         end   = ai[row + 1];
1032         for (l = start; l < end; l++) {
1033           val = aj[l] + cstart;
1034           if (!PetscBTLookupSet(xtable, val)) {
1035             if (!(ct3 < mem_estimate)) {
1036               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037               PetscCall(PetscMalloc1(new_estimate, &tmp));
1038               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039               PetscCall(PetscFree(xdata[0]));
1040               xdata[0]     = tmp;
1041               mem_estimate = new_estimate;
1042               ++no_malloc;
1043               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044             }
1045             xdata[i][ct2++] = val;
1046             ct3++;
1047           }
1048         }
1049         start = bi[row];
1050         end   = bi[row + 1];
1051         for (l = start; l < end; l++) {
1052           val = garray[bj[l]];
1053           if (!PetscBTLookupSet(xtable, val)) {
1054             if (!(ct3 < mem_estimate)) {
1055               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056               PetscCall(PetscMalloc1(new_estimate, &tmp));
1057               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058               PetscCall(PetscFree(xdata[0]));
1059               xdata[0]     = tmp;
1060               mem_estimate = new_estimate;
1061               ++no_malloc;
1062               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063             }
1064             xdata[i][ct2++] = val;
1065             ct3++;
1066           }
1067         }
1068       }
1069       /* Update the header*/
1070       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072     }
1073     xdata[i][0] = rbuf_0;
1074     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075     isz1[i] = ct2; /* size of each message */
1076   }
1077   PetscCall(PetscBTDestroy(&xtable));
1078   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079   PetscFunctionReturn(PETSC_SUCCESS);
1080 }
1081 /* -------------------------------------------------------------------------*/
1082 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083 /*
1084     Every processor gets the entire matrix
1085 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data; /* ad: diagonal block, bd: off-diagonal block */
  PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    /* local row lengths = nnz in diagonal block + nnz in off-diagonal block */
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgather: each rank contributes its own slice of lens[] */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    /* Bin is a length-2, NULL-terminated array holding the single result matrix */
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, merging the off-diagonal
       (bd) and diagonal (ad) columns of each row into globally increasing order
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into my slice of b->j */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nnz owned by rank i; displs from prefix sums */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: reuse the previously created sequential matrix */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* write directly into my slice of b->a */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* merge values in the same lower/diagonal/upper order used for the indices above */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); /* not allocated in the MAT_REUSE_MATRIX path */
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1231 
1232 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1233 {
1234   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1235   Mat             submat, A = c->A, B = c->B;
1236   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1237   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1238   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1239   const PetscInt *icol, *irow;
1240   PetscInt        nrow, ncol, start;
1241   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1242   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1243   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1244   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1245   PetscInt       *lens, rmax, ncols, *cols, Crow;
1246 #if defined(PETSC_USE_CTABLE)
1247   PetscHMapI cmap, rmap;
1248   PetscInt  *cmap_loc, *rmap_loc;
1249 #else
1250   PetscInt *cmap, *rmap;
1251 #endif
1252   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1253   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1254   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1255   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1256   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1257   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1258   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1259   MPI_Comm      comm;
1260   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1261   PetscMPIInt  *onodes1, *olengths1, idex, end;
1262   Mat_SubSppt  *smatis1;
1263   PetscBool     isrowsorted, iscolsorted;
1264 
1265   PetscFunctionBegin;
1266   PetscValidLogicalCollectiveInt(C, ismax, 2);
1267   PetscValidLogicalCollectiveEnum(C, scall, 5);
1268   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1269   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1270   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1271   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1272   size = c->size;
1273   rank = c->rank;
1274 
1275   PetscCall(ISSorted(iscol[0], &iscolsorted));
1276   PetscCall(ISSorted(isrow[0], &isrowsorted));
1277   PetscCall(ISGetIndices(isrow[0], &irow));
1278   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1279   if (allcolumns) {
1280     icol = NULL;
1281     ncol = C->cmap->N;
1282   } else {
1283     PetscCall(ISGetIndices(iscol[0], &icol));
1284     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1285   }
1286 
1287   if (scall == MAT_INITIAL_MATRIX) {
1288     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1289 
1290     /* Get some new tags to keep the communication clean */
1291     tag1 = ((PetscObject)C)->tag;
1292     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1293     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1294 
1295     /* evaluate communication - mesg to who, length of mesg, and buffer space
1296      required. Based on this, buffers are allocated, and data copied into them */
1297     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1298     PetscCall(PetscMalloc1(nrow, &row2proc));
1299 
1300     /* w1[proc] = num of rows owned by proc -- to be requested */
1301     proc = 0;
1302     nrqs = 0; /* num of outgoing messages */
1303     for (j = 0; j < nrow; j++) {
1304       row = irow[j];
1305       if (!isrowsorted) proc = 0;
1306       while (row >= C->rmap->range[proc + 1]) proc++;
1307       w1[proc]++;
1308       row2proc[j] = proc; /* map row index to proc */
1309 
1310       if (proc != rank && !w2[proc]) {
1311         w2[proc] = 1;
1312         nrqs++;
1313       }
1314     }
1315     w1[rank] = 0; /* rows owned by self will not be requested */
1316 
1317     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1318     for (proc = 0, j = 0; proc < size; proc++) {
1319       if (w1[proc]) pa[j++] = proc;
1320     }
1321 
1322     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1323     msz = 0; /* total mesg length (for all procs) */
1324     for (i = 0; i < nrqs; i++) {
1325       proc = pa[i];
1326       w1[proc] += 3;
1327       msz += w1[proc];
1328     }
1329     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1330 
1331     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1332     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1333     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1334 
1335     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1336        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1337     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1338 
1339     /* Now post the Irecvs corresponding to these messages */
1340     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1341 
1342     PetscCall(PetscFree(onodes1));
1343     PetscCall(PetscFree(olengths1));
1344 
1345     /* Allocate Memory for outgoing messages */
1346     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1347     PetscCall(PetscArrayzero(sbuf1, size));
1348     PetscCall(PetscArrayzero(ptr, size));
1349 
1350     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1351     iptr = tmp;
1352     for (i = 0; i < nrqs; i++) {
1353       proc        = pa[i];
1354       sbuf1[proc] = iptr;
1355       iptr += w1[proc];
1356     }
1357 
1358     /* Form the outgoing messages */
1359     /* Initialize the header space */
1360     for (i = 0; i < nrqs; i++) {
1361       proc = pa[i];
1362       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1363       ptr[proc] = sbuf1[proc] + 3;
1364     }
1365 
1366     /* Parse the isrow and copy data into outbuf */
1367     PetscCall(PetscArrayzero(ctr, size));
1368     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1369       proc = row2proc[j];
1370       if (proc != rank) { /* copy to the outgoing buf*/
1371         *ptr[proc] = irow[j];
1372         ctr[proc]++;
1373         ptr[proc]++;
1374       }
1375     }
1376 
1377     /* Update the headers for the current IS */
1378     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1379       if ((ctr_j = ctr[j])) {
1380         sbuf1_j            = sbuf1[j];
1381         k                  = ++sbuf1_j[0];
1382         sbuf1_j[2 * k]     = ctr_j;
1383         sbuf1_j[2 * k - 1] = 0;
1384       }
1385     }
1386 
1387     /* Now post the sends */
1388     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1389     for (i = 0; i < nrqs; ++i) {
1390       proc = pa[i];
1391       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1392     }
1393 
1394     /* Post Receives to capture the buffer size */
1395     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1396     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1397 
1398     if (nrqs) rbuf2[0] = tmp + msz;
1399     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1400 
1401     for (i = 0; i < nrqs; ++i) {
1402       proc = pa[i];
1403       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1404     }
1405 
1406     PetscCall(PetscFree2(w1, w2));
1407 
1408     /* Send to other procs the buf size they should allocate */
1409     /* Receive messages*/
1410     PetscCall(PetscMalloc1(nrqr, &r_status1));
1411     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1412 
1413     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1414     for (i = 0; i < nrqr; ++i) {
1415       req_size[i] = 0;
1416       rbuf1_i     = rbuf1[i];
1417       start       = 2 * rbuf1_i[0] + 1;
1418       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1419       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1420       sbuf2_i = sbuf2[i];
1421       for (j = start; j < end; j++) {
1422         k          = rbuf1_i[j] - rstart;
1423         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1424         sbuf2_i[j] = ncols;
1425         req_size[i] += ncols;
1426       }
1427       req_source1[i] = r_status1[i].MPI_SOURCE;
1428 
1429       /* form the header */
1430       sbuf2_i[0] = req_size[i];
1431       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1432 
1433       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1434     }
1435 
1436     PetscCall(PetscFree(r_status1));
1437     PetscCall(PetscFree(r_waits1));
1438 
1439     /* rbuf2 is received, Post recv column indices a->j */
1440     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1441 
1442     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1443     for (i = 0; i < nrqs; ++i) {
1444       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1445       req_source2[i] = r_status2[i].MPI_SOURCE;
1446       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1447     }
1448 
1449     /* Wait on sends1 and sends2 */
1450     PetscCall(PetscMalloc1(nrqs, &s_status1));
1451     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1452     PetscCall(PetscFree(s_waits1));
1453     PetscCall(PetscFree(s_status1));
1454 
1455     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1456     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1457 
1458     /* Now allocate sending buffers for a->j, and send them off */
1459     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1460     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1461     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1462     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1463 
1464     for (i = 0; i < nrqr; i++) { /* for each requested message */
1465       rbuf1_i   = rbuf1[i];
1466       sbuf_aj_i = sbuf_aj[i];
1467       ct1       = 2 * rbuf1_i[0] + 1;
1468       ct2       = 0;
1469       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1470 
1471       kmax = rbuf1[i][2];
1472       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1473         row    = rbuf1_i[ct1] - rstart;
1474         nzA    = ai[row + 1] - ai[row];
1475         nzB    = bi[row + 1] - bi[row];
1476         ncols  = nzA + nzB;
1477         cworkA = aj + ai[row];
1478         cworkB = bj + bi[row];
1479 
1480         /* load the column indices for this row into cols*/
1481         cols = sbuf_aj_i + ct2;
1482 
1483         lwrite = 0;
1484         for (l = 0; l < nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1486         }
1487         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1488         for (l = 0; l < nzB; l++) {
1489           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1490         }
1491 
1492         ct2 += ncols;
1493       }
1494       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1495     }
1496 
1497     /* create column map (cmap): global col of C -> local col of submat */
1498 #if defined(PETSC_USE_CTABLE)
1499     if (!allcolumns) {
1500       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1501       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1502       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1503         if (icol[j] >= cstart && icol[j] < cend) {
1504           cmap_loc[icol[j] - cstart] = j + 1;
1505         } else { /* use PetscHMapI for non-local col indices */
1506           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1507         }
1508       }
1509     } else {
1510       cmap     = NULL;
1511       cmap_loc = NULL;
1512     }
1513     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1514 #else
1515     if (!allcolumns) {
1516       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1517       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1518     } else {
1519       cmap = NULL;
1520     }
1521 #endif
1522 
1523     /* Create lens for MatSeqAIJSetPreallocation() */
1524     PetscCall(PetscCalloc1(nrow, &lens));
1525 
1526     /* Compute lens from local part of C */
1527     for (j = 0; j < nrow; j++) {
1528       row  = irow[j];
1529       proc = row2proc[j];
1530       if (proc == rank) {
1531         /* diagonal part A = c->A */
1532         ncols = ai[row - rstart + 1] - ai[row - rstart];
1533         cols  = aj + ai[row - rstart];
1534         if (!allcolumns) {
1535           for (k = 0; k < ncols; k++) {
1536 #if defined(PETSC_USE_CTABLE)
1537             tcol = cmap_loc[cols[k]];
1538 #else
1539             tcol = cmap[cols[k] + cstart];
1540 #endif
1541             if (tcol) lens[j]++;
1542           }
1543         } else { /* allcolumns */
1544           lens[j] = ncols;
1545         }
1546 
1547         /* off-diagonal part B = c->B */
1548         ncols = bi[row - rstart + 1] - bi[row - rstart];
1549         cols  = bj + bi[row - rstart];
1550         if (!allcolumns) {
1551           for (k = 0; k < ncols; k++) {
1552 #if defined(PETSC_USE_CTABLE)
1553             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1554 #else
1555             tcol = cmap[bmap[cols[k]]];
1556 #endif
1557             if (tcol) lens[j]++;
1558           }
1559         } else { /* allcolumns */
1560           lens[j] += ncols;
1561         }
1562       }
1563     }
1564 
1565     /* Create row map (rmap): global row of C -> local row of submat */
1566 #if defined(PETSC_USE_CTABLE)
1567     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1568     for (j = 0; j < nrow; j++) {
1569       row  = irow[j];
1570       proc = row2proc[j];
1571       if (proc == rank) { /* a local row */
1572         rmap_loc[row - rstart] = j;
1573       } else {
1574         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1575       }
1576     }
1577 #else
1578     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1579     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1580 #endif
1581 
1582     /* Update lens from offproc data */
1583     /* recv a->j is done */
1584     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1585     for (i = 0; i < nrqs; i++) {
1586       proc    = pa[i];
1587       sbuf1_i = sbuf1[proc];
1588       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1589       ct1     = 2 + 1;
1590       ct2     = 0;
1591       rbuf2_i = rbuf2[i]; /* received length of C->j */
1592       rbuf3_i = rbuf3[i]; /* received C->j */
1593 
1594       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1595       max1 = sbuf1_i[2];
1596       for (k = 0; k < max1; k++, ct1++) {
1597 #if defined(PETSC_USE_CTABLE)
1598         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1599         row--;
1600         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1601 #else
1602         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1603 #endif
1604         /* Now, store row index of submat in sbuf1_i[ct1] */
1605         sbuf1_i[ct1] = row;
1606 
1607         nnz = rbuf2_i[ct1];
1608         if (!allcolumns) {
1609           for (l = 0; l < nnz; l++, ct2++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1612               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1613             } else {
1614               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1615             }
1616 #else
1617             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1618 #endif
1619             if (tcol) lens[row]++;
1620           }
1621         } else { /* allcolumns */
1622           lens[row] += nnz;
1623         }
1624       }
1625     }
1626     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1627     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1628 
1629     /* Create the submatrices */
1630     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1631     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1632 
1633     PetscCall(ISGetBlockSize(isrow[0], &i));
1634     PetscCall(ISGetBlockSize(iscol[0], &j));
1635     PetscCall(MatSetBlockSizes(submat, i, j));
1636     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1637     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1638 
1639     /* create struct Mat_SubSppt and attached it to submat */
1640     PetscCall(PetscNew(&smatis1));
1641     subc            = (Mat_SeqAIJ *)submat->data;
1642     subc->submatis1 = smatis1;
1643 
1644     smatis1->id          = 0;
1645     smatis1->nrqs        = nrqs;
1646     smatis1->nrqr        = nrqr;
1647     smatis1->rbuf1       = rbuf1;
1648     smatis1->rbuf2       = rbuf2;
1649     smatis1->rbuf3       = rbuf3;
1650     smatis1->sbuf2       = sbuf2;
1651     smatis1->req_source2 = req_source2;
1652 
1653     smatis1->sbuf1 = sbuf1;
1654     smatis1->ptr   = ptr;
1655     smatis1->tmp   = tmp;
1656     smatis1->ctr   = ctr;
1657 
1658     smatis1->pa          = pa;
1659     smatis1->req_size    = req_size;
1660     smatis1->req_source1 = req_source1;
1661 
1662     smatis1->allcolumns = allcolumns;
1663     smatis1->singleis   = PETSC_TRUE;
1664     smatis1->row2proc   = row2proc;
1665     smatis1->rmap       = rmap;
1666     smatis1->cmap       = cmap;
1667 #if defined(PETSC_USE_CTABLE)
1668     smatis1->rmap_loc = rmap_loc;
1669     smatis1->cmap_loc = cmap_loc;
1670 #endif
1671 
1672     smatis1->destroy     = submat->ops->destroy;
1673     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1674     submat->factortype   = C->factortype;
1675 
1676     /* compute rmax */
1677     rmax = 0;
1678     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1679 
1680   } else { /* scall == MAT_REUSE_MATRIX */
1681     submat = submats[0];
1682     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1683 
1684     subc        = (Mat_SeqAIJ *)submat->data;
1685     rmax        = subc->rmax;
1686     smatis1     = subc->submatis1;
1687     nrqs        = smatis1->nrqs;
1688     nrqr        = smatis1->nrqr;
1689     rbuf1       = smatis1->rbuf1;
1690     rbuf2       = smatis1->rbuf2;
1691     rbuf3       = smatis1->rbuf3;
1692     req_source2 = smatis1->req_source2;
1693 
1694     sbuf1 = smatis1->sbuf1;
1695     sbuf2 = smatis1->sbuf2;
1696     ptr   = smatis1->ptr;
1697     tmp   = smatis1->tmp;
1698     ctr   = smatis1->ctr;
1699 
1700     pa          = smatis1->pa;
1701     req_size    = smatis1->req_size;
1702     req_source1 = smatis1->req_source1;
1703 
1704     allcolumns = smatis1->allcolumns;
1705     row2proc   = smatis1->row2proc;
1706     rmap       = smatis1->rmap;
1707     cmap       = smatis1->cmap;
1708 #if defined(PETSC_USE_CTABLE)
1709     rmap_loc = smatis1->rmap_loc;
1710     cmap_loc = smatis1->cmap_loc;
1711 #endif
1712   }
1713 
1714   /* Post recv matrix values */
1715   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1716   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1717   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1718   for (i = 0; i < nrqs; ++i) {
1719     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1720     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1721   }
1722 
1723   /* Allocate sending buffers for a->a, and send them off */
1724   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1725   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1726   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1727   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1728 
1729   for (i = 0; i < nrqr; i++) {
1730     rbuf1_i   = rbuf1[i];
1731     sbuf_aa_i = sbuf_aa[i];
1732     ct1       = 2 * rbuf1_i[0] + 1;
1733     ct2       = 0;
1734     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1735 
1736     kmax = rbuf1_i[2];
1737     for (k = 0; k < kmax; k++, ct1++) {
1738       row    = rbuf1_i[ct1] - rstart;
1739       nzA    = ai[row + 1] - ai[row];
1740       nzB    = bi[row + 1] - bi[row];
1741       ncols  = nzA + nzB;
1742       cworkB = bj + bi[row];
1743       vworkA = a_a + ai[row];
1744       vworkB = b_a + bi[row];
1745 
1746       /* load the column values for this row into vals*/
1747       vals = sbuf_aa_i + ct2;
1748 
1749       lwrite = 0;
1750       for (l = 0; l < nzB; l++) {
1751         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1752       }
1753       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1754       for (l = 0; l < nzB; l++) {
1755         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1756       }
1757 
1758       ct2 += ncols;
1759     }
1760     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1761   }
1762 
1763   /* Assemble submat */
1764   /* First assemble the local rows */
1765   for (j = 0; j < nrow; j++) {
1766     row  = irow[j];
1767     proc = row2proc[j];
1768     if (proc == rank) {
1769       Crow = row - rstart; /* local row index of C */
1770 #if defined(PETSC_USE_CTABLE)
1771       row = rmap_loc[Crow]; /* row index of submat */
1772 #else
1773       row = rmap[row];
1774 #endif
1775 
1776       if (allcolumns) {
1777         /* diagonal part A = c->A */
1778         ncols = ai[Crow + 1] - ai[Crow];
1779         cols  = aj + ai[Crow];
1780         vals  = a_a + ai[Crow];
1781         i     = 0;
1782         for (k = 0; k < ncols; k++) {
1783           subcols[i]   = cols[k] + cstart;
1784           subvals[i++] = vals[k];
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow + 1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k = 0; k < ncols; k++) {
1792           subcols[i]   = bmap[cols[k]];
1793           subvals[i++] = vals[k];
1794         }
1795 
1796         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1797 
1798       } else { /* !allcolumns */
1799 #if defined(PETSC_USE_CTABLE)
1800         /* diagonal part A = c->A */
1801         ncols = ai[Crow + 1] - ai[Crow];
1802         cols  = aj + ai[Crow];
1803         vals  = a_a + ai[Crow];
1804         i     = 0;
1805         for (k = 0; k < ncols; k++) {
1806           tcol = cmap_loc[cols[k]];
1807           if (tcol) {
1808             subcols[i]   = --tcol;
1809             subvals[i++] = vals[k];
1810           }
1811         }
1812 
1813         /* off-diagonal part B = c->B */
1814         ncols = bi[Crow + 1] - bi[Crow];
1815         cols  = bj + bi[Crow];
1816         vals  = b_a + bi[Crow];
1817         for (k = 0; k < ncols; k++) {
1818           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1819           if (tcol) {
1820             subcols[i]   = --tcol;
1821             subvals[i++] = vals[k];
1822           }
1823         }
1824 #else
1825         /* diagonal part A = c->A */
1826         ncols = ai[Crow + 1] - ai[Crow];
1827         cols  = aj + ai[Crow];
1828         vals  = a_a + ai[Crow];
1829         i     = 0;
1830         for (k = 0; k < ncols; k++) {
1831           tcol = cmap[cols[k] + cstart];
1832           if (tcol) {
1833             subcols[i]   = --tcol;
1834             subvals[i++] = vals[k];
1835           }
1836         }
1837 
1838         /* off-diagonal part B = c->B */
1839         ncols = bi[Crow + 1] - bi[Crow];
1840         cols  = bj + bi[Crow];
1841         vals  = b_a + bi[Crow];
1842         for (k = 0; k < ncols; k++) {
1843           tcol = cmap[bmap[cols[k]]];
1844           if (tcol) {
1845             subcols[i]   = --tcol;
1846             subvals[i++] = vals[k];
1847           }
1848         }
1849 #endif
1850         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1851       }
1852     }
1853   }
1854 
1855   /* Now assemble the off-proc rows */
1856   for (i = 0; i < nrqs; i++) { /* for each requested message */
1857     /* recv values from other processes */
1858     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1859     proc    = pa[idex];
1860     sbuf1_i = sbuf1[proc];
1861     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1862     ct1     = 2 + 1;
1863     ct2     = 0;           /* count of received C->j */
1864     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1865     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1866     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1867     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1868 
1869     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1870     max1 = sbuf1_i[2];                  /* num of rows */
1871     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1872       row = sbuf1_i[ct1];               /* row index of submat */
1873       if (!allcolumns) {
1874         idex = 0;
1875         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1876           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1877           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1878 #if defined(PETSC_USE_CTABLE)
1879             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1880               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1881             } else {
1882               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1883             }
1884 #else
1885             tcol = cmap[rbuf3_i[ct2]];
1886 #endif
1887             if (tcol) {
1888               subcols[idex]   = --tcol; /* may not be sorted */
1889               subvals[idex++] = rbuf4_i[ct2];
1890 
1891               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1892                For reuse, we replace received C->j with index that should be inserted to submat */
1893               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1894             }
1895           }
1896           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1897         } else { /* scall == MAT_REUSE_MATRIX */
1898           submat = submats[0];
1899           subc   = (Mat_SeqAIJ *)submat->data;
1900 
1901           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1902           for (l = 0; l < nnz; l++) {
1903             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1904             subvals[idex++] = rbuf4_i[ct2];
1905           }
1906 
1907           bj = subc->j + subc->i[row]; /* sorted column indices */
1908           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1909         }
1910       } else {              /* allcolumns */
1911         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1912         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1913         ct2 += nnz;
1914       }
1915     }
1916   }
1917 
1918   /* sending a->a are done */
1919   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1920   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1921 
1922   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1923   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1924   submats[0] = submat;
1925 
1926   /* Restore the indices */
1927   PetscCall(ISRestoreIndices(isrow[0], &irow));
1928   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1929 
1930   /* Destroy allocated memory */
1931   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1932   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1933   if (sbuf_aa) {
1934     PetscCall(PetscFree(sbuf_aa[0]));
1935     PetscCall(PetscFree(sbuf_aa));
1936   }
1937 
1938   if (scall == MAT_INITIAL_MATRIX) {
1939     PetscCall(PetscFree(lens));
1940     if (sbuf_aj) {
1941       PetscCall(PetscFree(sbuf_aj[0]));
1942       PetscCall(PetscFree(sbuf_aj));
1943     }
1944   }
1945   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1946   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1947   PetscFunctionReturn(PETSC_SUCCESS);
1948 }
1949 
1950 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1951 {
1952   PetscInt  ncol;
1953   PetscBool colflag, allcolumns = PETSC_FALSE;
1954 
1955   PetscFunctionBegin;
1956   /* Allocate memory to hold all the submatrices */
1957   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1958 
1959   /* Check for special case: each processor gets entire matrix columns */
1960   PetscCall(ISIdentity(iscol[0], &colflag));
1961   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1962   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1963 
1964   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1965   PetscFunctionReturn(PETSC_SUCCESS);
1966 }
1967 
1968 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1969 {
1970   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1971   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1972   Mat_SeqAIJ  *subc;
1973   Mat_SubSppt *smat;
1974 
1975   PetscFunctionBegin;
1976   /* Check for special case: each processor has a single IS */
1977   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1978     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1979     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1980     PetscFunctionReturn(PETSC_SUCCESS);
1981   }
1982 
1983   /* Collect global wantallmatrix and nstages */
1984   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1985   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1986   if (!nmax) nmax = 1;
1987 
1988   if (scall == MAT_INITIAL_MATRIX) {
1989     /* Collect global wantallmatrix and nstages */
1990     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1991       PetscCall(ISIdentity(*isrow, &rowflag));
1992       PetscCall(ISIdentity(*iscol, &colflag));
1993       PetscCall(ISGetLocalSize(*isrow, &nrow));
1994       PetscCall(ISGetLocalSize(*iscol, &ncol));
1995       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1996         wantallmatrix = PETSC_TRUE;
1997 
1998         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1999       }
2000     }
2001 
2002     /* Determine the number of stages through which submatrices are done
2003        Each stage will extract nmax submatrices.
2004        nmax is determined by the matrix column dimension.
2005        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2006     */
2007     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2008 
2009     in[0] = -1 * (PetscInt)wantallmatrix;
2010     in[1] = nstages;
2011     PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
2012     wantallmatrix = (PetscBool)(-out[0]);
2013     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2014 
2015   } else { /* MAT_REUSE_MATRIX */
2016     if (ismax) {
2017       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2018       smat = subc->submatis1;
2019     } else { /* (*submat)[0] is a dummy matrix */
2020       smat = (Mat_SubSppt *)(*submat)[0]->data;
2021     }
2022     if (!smat) {
2023       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2024       wantallmatrix = PETSC_TRUE;
2025     } else if (smat->singleis) {
2026       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2027       PetscFunctionReturn(PETSC_SUCCESS);
2028     } else {
2029       nstages = smat->nstages;
2030     }
2031   }
2032 
2033   if (wantallmatrix) {
2034     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2035     PetscFunctionReturn(PETSC_SUCCESS);
2036   }
2037 
2038   /* Allocate memory to hold all the submatrices and dummy submatrices */
2039   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2040 
2041   for (i = 0, pos = 0; i < nstages; i++) {
2042     if (pos + nmax <= ismax) max_no = nmax;
2043     else if (pos >= ismax) max_no = 0;
2044     else max_no = ismax - pos;
2045 
2046     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
2047     if (!max_no) {
2048       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2049         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2050         smat->nstages = nstages;
2051       }
2052       pos++; /* advance to next dummy matrix if any */
2053     } else pos += max_no;
2054   }
2055 
2056   if (ismax && scall == MAT_INITIAL_MATRIX) {
2057     /* save nstages for reuse */
2058     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2059     smat          = subc->submatis1;
2060     smat->nstages = nstages;
2061   }
2062   PetscFunctionReturn(PETSC_SUCCESS);
2063 }
2064 
2065 /* -------------------------------------------------------------------------*/
2066 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2067 {
2068   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2069   Mat              A = c->A;
2070   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2071   const PetscInt **icol, **irow;
2072   PetscInt        *nrow, *ncol, start;
2073   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2074   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2075   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2076   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2077   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2078 #if defined(PETSC_USE_CTABLE)
2079   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2080 #else
2081   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2082 #endif
2083   const PetscInt *irow_i;
2084   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2085   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2086   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2087   MPI_Comm        comm;
2088   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2089   PetscMPIInt    *onodes1, *olengths1, end;
2090   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2091   Mat_SubSppt    *smat_i;
2092   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2093   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2094 
2095   PetscFunctionBegin;
2096   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2097   size = c->size;
2098   rank = c->rank;
2099 
2100   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2101   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2102 
2103   for (i = 0; i < ismax; i++) {
2104     PetscCall(ISSorted(iscol[i], &issorted[i]));
2105     if (!issorted[i]) iscsorted = issorted[i];
2106 
2107     PetscCall(ISSorted(isrow[i], &issorted[i]));
2108 
2109     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2110     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2111 
2112     /* Check for special case: allcolumn */
2113     PetscCall(ISIdentity(iscol[i], &colflag));
2114     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2115     if (colflag && ncol[i] == C->cmap->N) {
2116       allcolumns[i] = PETSC_TRUE;
2117       icol[i]       = NULL;
2118     } else {
2119       allcolumns[i] = PETSC_FALSE;
2120       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2121     }
2122   }
2123 
2124   if (scall == MAT_REUSE_MATRIX) {
2125     /* Assumes new rows are same length as the old rows */
2126     for (i = 0; i < ismax; i++) {
2127       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2128       subc = (Mat_SeqAIJ *)submats[i]->data;
2129       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2130 
2131       /* Initial matrix as if empty */
2132       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2133 
2134       smat_i = subc->submatis1;
2135 
2136       nrqs        = smat_i->nrqs;
2137       nrqr        = smat_i->nrqr;
2138       rbuf1       = smat_i->rbuf1;
2139       rbuf2       = smat_i->rbuf2;
2140       rbuf3       = smat_i->rbuf3;
2141       req_source2 = smat_i->req_source2;
2142 
2143       sbuf1 = smat_i->sbuf1;
2144       sbuf2 = smat_i->sbuf2;
2145       ptr   = smat_i->ptr;
2146       tmp   = smat_i->tmp;
2147       ctr   = smat_i->ctr;
2148 
2149       pa          = smat_i->pa;
2150       req_size    = smat_i->req_size;
2151       req_source1 = smat_i->req_source1;
2152 
2153       allcolumns[i] = smat_i->allcolumns;
2154       row2proc[i]   = smat_i->row2proc;
2155       rmap[i]       = smat_i->rmap;
2156       cmap[i]       = smat_i->cmap;
2157     }
2158 
2159     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2160       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2161       smat_i = (Mat_SubSppt *)submats[0]->data;
2162 
2163       nrqs        = smat_i->nrqs;
2164       nrqr        = smat_i->nrqr;
2165       rbuf1       = smat_i->rbuf1;
2166       rbuf2       = smat_i->rbuf2;
2167       rbuf3       = smat_i->rbuf3;
2168       req_source2 = smat_i->req_source2;
2169 
2170       sbuf1 = smat_i->sbuf1;
2171       sbuf2 = smat_i->sbuf2;
2172       ptr   = smat_i->ptr;
2173       tmp   = smat_i->tmp;
2174       ctr   = smat_i->ctr;
2175 
2176       pa          = smat_i->pa;
2177       req_size    = smat_i->req_size;
2178       req_source1 = smat_i->req_source1;
2179 
2180       allcolumns[0] = PETSC_FALSE;
2181     }
2182   } else { /* scall == MAT_INITIAL_MATRIX */
2183     /* Get some new tags to keep the communication clean */
2184     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2185     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2186 
2187     /* evaluate communication - mesg to who, length of mesg, and buffer space
2188      required. Based on this, buffers are allocated, and data copied into them*/
2189     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2190 
2191     for (i = 0; i < ismax; i++) {
2192       jmax   = nrow[i];
2193       irow_i = irow[i];
2194 
2195       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2196       row2proc[i] = row2proc_i;
2197 
2198       if (issorted[i]) proc = 0;
2199       for (j = 0; j < jmax; j++) {
2200         if (!issorted[i]) proc = 0;
2201         row = irow_i[j];
2202         while (row >= C->rmap->range[proc + 1]) proc++;
2203         w4[proc]++;
2204         row2proc_i[j] = proc; /* map row index to proc */
2205       }
2206       for (j = 0; j < size; j++) {
2207         if (w4[j]) {
2208           w1[j] += w4[j];
2209           w3[j]++;
2210           w4[j] = 0;
2211         }
2212       }
2213     }
2214 
2215     nrqs     = 0; /* no of outgoing messages */
2216     msz      = 0; /* total mesg length (for all procs) */
2217     w1[rank] = 0; /* no mesg sent to self */
2218     w3[rank] = 0;
2219     for (i = 0; i < size; i++) {
2220       if (w1[i]) {
2221         w2[i] = 1;
2222         nrqs++;
2223       } /* there exists a message to proc i */
2224     }
2225     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2226     for (i = 0, j = 0; i < size; i++) {
2227       if (w1[i]) {
2228         pa[j] = i;
2229         j++;
2230       }
2231     }
2232 
2233     /* Each message would have a header = 1 + 2*(no of IS) + data */
2234     for (i = 0; i < nrqs; i++) {
2235       j = pa[i];
2236       w1[j] += w2[j] + 2 * w3[j];
2237       msz += w1[j];
2238     }
2239     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2240 
2241     /* Determine the number of messages to expect, their lengths, from from-ids */
2242     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2243     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2244 
2245     /* Now post the Irecvs corresponding to these messages */
2246     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2247     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2248 
2249     /* Allocate Memory for outgoing messages */
2250     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2251     PetscCall(PetscArrayzero(sbuf1, size));
2252     PetscCall(PetscArrayzero(ptr, size));
2253 
2254     {
2255       PetscInt *iptr = tmp;
2256       k              = 0;
2257       for (i = 0; i < nrqs; i++) {
2258         j = pa[i];
2259         iptr += k;
2260         sbuf1[j] = iptr;
2261         k        = w1[j];
2262       }
2263     }
2264 
2265     /* Form the outgoing messages. Initialize the header space */
2266     for (i = 0; i < nrqs; i++) {
2267       j           = pa[i];
2268       sbuf1[j][0] = 0;
2269       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2270       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2271     }
2272 
2273     /* Parse the isrow and copy data into outbuf */
2274     for (i = 0; i < ismax; i++) {
2275       row2proc_i = row2proc[i];
2276       PetscCall(PetscArrayzero(ctr, size));
2277       irow_i = irow[i];
2278       jmax   = nrow[i];
2279       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2280         proc = row2proc_i[j];
2281         if (proc != rank) { /* copy to the outgoing buf*/
2282           ctr[proc]++;
2283           *ptr[proc] = irow_i[j];
2284           ptr[proc]++;
2285         }
2286       }
2287       /* Update the headers for the current IS */
2288       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2289         if ((ctr_j = ctr[j])) {
2290           sbuf1_j            = sbuf1[j];
2291           k                  = ++sbuf1_j[0];
2292           sbuf1_j[2 * k]     = ctr_j;
2293           sbuf1_j[2 * k - 1] = i;
2294         }
2295       }
2296     }
2297 
2298     /*  Now  post the sends */
2299     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2300     for (i = 0; i < nrqs; ++i) {
2301       j = pa[i];
2302       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2303     }
2304 
2305     /* Post Receives to capture the buffer size */
2306     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2307     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2308     if (nrqs) rbuf2[0] = tmp + msz;
2309     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2310     for (i = 0; i < nrqs; ++i) {
2311       j = pa[i];
2312       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2313     }
2314 
2315     /* Send to other procs the buf size they should allocate */
2316     /* Receive messages*/
2317     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2318     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2319     {
2320       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2321       PetscInt *sbuf2_i;
2322 
2323       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2324       for (i = 0; i < nrqr; ++i) {
2325         req_size[i] = 0;
2326         rbuf1_i     = rbuf1[i];
2327         start       = 2 * rbuf1_i[0] + 1;
2328         end         = olengths1[i];
2329         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2330         sbuf2_i = sbuf2[i];
2331         for (j = start; j < end; j++) {
2332           id         = rbuf1_i[j] - rstart;
2333           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2334           sbuf2_i[j] = ncols;
2335           req_size[i] += ncols;
2336         }
2337         req_source1[i] = onodes1[i];
2338         /* form the header */
2339         sbuf2_i[0] = req_size[i];
2340         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2341 
2342         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2343       }
2344     }
2345 
2346     PetscCall(PetscFree(onodes1));
2347     PetscCall(PetscFree(olengths1));
2348     PetscCall(PetscFree(r_waits1));
2349     PetscCall(PetscFree4(w1, w2, w3, w4));
2350 
2351     /* Receive messages*/
2352     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2353     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2354     for (i = 0; i < nrqs; ++i) {
2355       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2356       req_source2[i] = pa[i];
2357       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2358     }
2359     PetscCall(PetscFree(r_waits2));
2360 
2361     /* Wait on sends1 and sends2 */
2362     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2363     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2364     PetscCall(PetscFree(s_waits1));
2365     PetscCall(PetscFree(s_waits2));
2366 
2367     /* Now allocate sending buffers for a->j, and send them off */
2368     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2369     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2370     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2371     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2372 
2373     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2374     {
2375       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2376       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2377       PetscInt  cend = C->cmap->rend;
2378       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2379 
2380       for (i = 0; i < nrqr; i++) {
2381         rbuf1_i   = rbuf1[i];
2382         sbuf_aj_i = sbuf_aj[i];
2383         ct1       = 2 * rbuf1_i[0] + 1;
2384         ct2       = 0;
2385         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2386           kmax = rbuf1[i][2 * j];
2387           for (k = 0; k < kmax; k++, ct1++) {
2388             row    = rbuf1_i[ct1] - rstart;
2389             nzA    = a_i[row + 1] - a_i[row];
2390             nzB    = b_i[row + 1] - b_i[row];
2391             ncols  = nzA + nzB;
2392             cworkA = a_j + a_i[row];
2393             cworkB = b_j + b_i[row];
2394 
2395             /* load the column indices for this row into cols */
2396             cols = sbuf_aj_i + ct2;
2397 
2398             lwrite = 0;
2399             for (l = 0; l < nzB; l++) {
2400               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2401             }
2402             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2403             for (l = 0; l < nzB; l++) {
2404               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2405             }
2406 
2407             ct2 += ncols;
2408           }
2409         }
2410         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2411       }
2412     }
2413 
2414     /* create col map: global col of C -> local col of submatrices */
2415     {
2416       const PetscInt *icol_i;
2417 #if defined(PETSC_USE_CTABLE)
2418       for (i = 0; i < ismax; i++) {
2419         if (!allcolumns[i]) {
2420           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2421 
2422           jmax   = ncol[i];
2423           icol_i = icol[i];
2424           cmap_i = cmap[i];
2425           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2426         } else cmap[i] = NULL;
2427       }
2428 #else
2429       for (i = 0; i < ismax; i++) {
2430         if (!allcolumns[i]) {
2431           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2432           jmax   = ncol[i];
2433           icol_i = icol[i];
2434           cmap_i = cmap[i];
2435           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2436         } else cmap[i] = NULL;
2437       }
2438 #endif
2439     }
2440 
2441     /* Create lens which is required for MatCreate... */
2442     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2443     PetscCall(PetscMalloc1(ismax, &lens));
2444 
2445     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2446     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2447 
2448     /* Update lens from local data */
2449     for (i = 0; i < ismax; i++) {
2450       row2proc_i = row2proc[i];
2451       jmax       = nrow[i];
2452       if (!allcolumns[i]) cmap_i = cmap[i];
2453       irow_i = irow[i];
2454       lens_i = lens[i];
2455       for (j = 0; j < jmax; j++) {
2456         row  = irow_i[j];
2457         proc = row2proc_i[j];
2458         if (proc == rank) {
2459           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2460           if (!allcolumns[i]) {
2461             for (k = 0; k < ncols; k++) {
2462 #if defined(PETSC_USE_CTABLE)
2463               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2464 #else
2465               tcol = cmap_i[cols[k]];
2466 #endif
2467               if (tcol) lens_i[j]++;
2468             }
2469           } else { /* allcolumns */
2470             lens_i[j] = ncols;
2471           }
2472           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2473         }
2474       }
2475     }
2476 
2477     /* Create row map: global row of C -> local row of submatrices */
2478 #if defined(PETSC_USE_CTABLE)
2479     for (i = 0; i < ismax; i++) {
2480       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2481       irow_i = irow[i];
2482       jmax   = nrow[i];
2483       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2484     }
2485 #else
2486     for (i = 0; i < ismax; i++) {
2487       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2488       rmap_i = rmap[i];
2489       irow_i = irow[i];
2490       jmax   = nrow[i];
2491       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2492     }
2493 #endif
2494 
2495     /* Update lens from offproc data */
2496     {
2497       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2498 
2499       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2500       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2501         sbuf1_i = sbuf1[pa[tmp2]];
2502         jmax    = sbuf1_i[0];
2503         ct1     = 2 * jmax + 1;
2504         ct2     = 0;
2505         rbuf2_i = rbuf2[tmp2];
2506         rbuf3_i = rbuf3[tmp2];
2507         for (j = 1; j <= jmax; j++) {
2508           is_no  = sbuf1_i[2 * j - 1];
2509           max1   = sbuf1_i[2 * j];
2510           lens_i = lens[is_no];
2511           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2512           rmap_i = rmap[is_no];
2513           for (k = 0; k < max1; k++, ct1++) {
2514 #if defined(PETSC_USE_CTABLE)
2515             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2516             row--;
2517             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2518 #else
2519             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2520 #endif
2521             max2 = rbuf2_i[ct1];
2522             for (l = 0; l < max2; l++, ct2++) {
2523               if (!allcolumns[is_no]) {
2524 #if defined(PETSC_USE_CTABLE)
2525                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2526 #else
2527                 tcol = cmap_i[rbuf3_i[ct2]];
2528 #endif
2529                 if (tcol) lens_i[row]++;
2530               } else {         /* allcolumns */
2531                 lens_i[row]++; /* lens_i[row] += max2 ? */
2532               }
2533             }
2534           }
2535         }
2536       }
2537     }
2538     PetscCall(PetscFree(r_waits3));
2539     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2540     PetscCall(PetscFree(s_waits3));
2541 
2542     /* Create the submatrices */
2543     for (i = 0; i < ismax; i++) {
2544       PetscInt rbs, cbs;
2545 
2546       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2547       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2548 
2549       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2550       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2551 
2552       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2553       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2554       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2555 
2556       /* create struct Mat_SubSppt and attached it to submat */
2557       PetscCall(PetscNew(&smat_i));
2558       subc            = (Mat_SeqAIJ *)submats[i]->data;
2559       subc->submatis1 = smat_i;
2560 
2561       smat_i->destroy          = submats[i]->ops->destroy;
2562       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2563       submats[i]->factortype   = C->factortype;
2564 
2565       smat_i->id          = i;
2566       smat_i->nrqs        = nrqs;
2567       smat_i->nrqr        = nrqr;
2568       smat_i->rbuf1       = rbuf1;
2569       smat_i->rbuf2       = rbuf2;
2570       smat_i->rbuf3       = rbuf3;
2571       smat_i->sbuf2       = sbuf2;
2572       smat_i->req_source2 = req_source2;
2573 
2574       smat_i->sbuf1 = sbuf1;
2575       smat_i->ptr   = ptr;
2576       smat_i->tmp   = tmp;
2577       smat_i->ctr   = ctr;
2578 
2579       smat_i->pa          = pa;
2580       smat_i->req_size    = req_size;
2581       smat_i->req_source1 = req_source1;
2582 
2583       smat_i->allcolumns = allcolumns[i];
2584       smat_i->singleis   = PETSC_FALSE;
2585       smat_i->row2proc   = row2proc[i];
2586       smat_i->rmap       = rmap[i];
2587       smat_i->cmap       = cmap[i];
2588     }
2589 
2590     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2591       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2592       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2593       PetscCall(MatSetType(submats[0], MATDUMMY));
2594 
2595       /* create struct Mat_SubSppt and attached it to submat */
2596       PetscCall(PetscNew(&smat_i));
2597       submats[0]->data = (void *)smat_i;
2598 
2599       smat_i->destroy          = submats[0]->ops->destroy;
2600       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2601       submats[0]->factortype   = C->factortype;
2602 
2603       smat_i->id          = 0;
2604       smat_i->nrqs        = nrqs;
2605       smat_i->nrqr        = nrqr;
2606       smat_i->rbuf1       = rbuf1;
2607       smat_i->rbuf2       = rbuf2;
2608       smat_i->rbuf3       = rbuf3;
2609       smat_i->sbuf2       = sbuf2;
2610       smat_i->req_source2 = req_source2;
2611 
2612       smat_i->sbuf1 = sbuf1;
2613       smat_i->ptr   = ptr;
2614       smat_i->tmp   = tmp;
2615       smat_i->ctr   = ctr;
2616 
2617       smat_i->pa          = pa;
2618       smat_i->req_size    = req_size;
2619       smat_i->req_source1 = req_source1;
2620 
2621       smat_i->allcolumns = PETSC_FALSE;
2622       smat_i->singleis   = PETSC_FALSE;
2623       smat_i->row2proc   = NULL;
2624       smat_i->rmap       = NULL;
2625       smat_i->cmap       = NULL;
2626     }
2627 
2628     if (ismax) PetscCall(PetscFree(lens[0]));
2629     PetscCall(PetscFree(lens));
2630     if (sbuf_aj) {
2631       PetscCall(PetscFree(sbuf_aj[0]));
2632       PetscCall(PetscFree(sbuf_aj));
2633     }
2634 
2635   } /* endof scall == MAT_INITIAL_MATRIX */
2636 
2637   /* Post recv matrix values */
2638   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2639   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2640   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2641   for (i = 0; i < nrqs; ++i) {
2642     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2643     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2644   }
2645 
2646   /* Allocate sending buffers for a->a, and send them off */
2647   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2648   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2649   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2650   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2651 
2652   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2653   {
2654     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2655     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2656     PetscInt     cend = C->cmap->rend;
2657     PetscInt    *b_j  = b->j;
2658     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2659 
2660     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2661     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2662     for (i = 0; i < nrqr; i++) {
2663       rbuf1_i   = rbuf1[i];
2664       sbuf_aa_i = sbuf_aa[i];
2665       ct1       = 2 * rbuf1_i[0] + 1;
2666       ct2       = 0;
2667       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2668         kmax = rbuf1_i[2 * j];
2669         for (k = 0; k < kmax; k++, ct1++) {
2670           row    = rbuf1_i[ct1] - rstart;
2671           nzA    = a_i[row + 1] - a_i[row];
2672           nzB    = b_i[row + 1] - b_i[row];
2673           ncols  = nzA + nzB;
2674           cworkB = b_j + b_i[row];
2675           vworkA = a_a + a_i[row];
2676           vworkB = b_a + b_i[row];
2677 
2678           /* load the column values for this row into vals*/
2679           vals = sbuf_aa_i + ct2;
2680 
2681           lwrite = 0;
2682           for (l = 0; l < nzB; l++) {
2683             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2684           }
2685           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2686           for (l = 0; l < nzB; l++) {
2687             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2688           }
2689 
2690           ct2 += ncols;
2691         }
2692       }
2693       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2694     }
2695     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2696     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2697   }
2698 
2699   /* Assemble the matrices */
2700   /* First assemble the local rows */
2701   for (i = 0; i < ismax; i++) {
2702     row2proc_i = row2proc[i];
2703     subc       = (Mat_SeqAIJ *)submats[i]->data;
2704     imat_ilen  = subc->ilen;
2705     imat_j     = subc->j;
2706     imat_i     = subc->i;
2707     imat_a     = subc->a;
2708 
2709     if (!allcolumns[i]) cmap_i = cmap[i];
2710     rmap_i = rmap[i];
2711     irow_i = irow[i];
2712     jmax   = nrow[i];
2713     for (j = 0; j < jmax; j++) {
2714       row  = irow_i[j];
2715       proc = row2proc_i[j];
2716       if (proc == rank) {
2717         old_row = row;
2718 #if defined(PETSC_USE_CTABLE)
2719         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2720         row--;
2721 #else
2722         row = rmap_i[row];
2723 #endif
2724         ilen_row = imat_ilen[row];
2725         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2726         mat_i = imat_i[row];
2727         mat_a = imat_a + mat_i;
2728         mat_j = imat_j + mat_i;
2729         if (!allcolumns[i]) {
2730           for (k = 0; k < ncols; k++) {
2731 #if defined(PETSC_USE_CTABLE)
2732             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2733 #else
2734             tcol = cmap_i[cols[k]];
2735 #endif
2736             if (tcol) {
2737               *mat_j++ = tcol - 1;
2738               *mat_a++ = vals[k];
2739               ilen_row++;
2740             }
2741           }
2742         } else { /* allcolumns */
2743           for (k = 0; k < ncols; k++) {
2744             *mat_j++ = cols[k]; /* global col index! */
2745             *mat_a++ = vals[k];
2746             ilen_row++;
2747           }
2748         }
2749         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2750 
2751         imat_ilen[row] = ilen_row;
2752       }
2753     }
2754   }
2755 
2756   /* Now assemble the off proc rows */
2757   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2758   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2759     sbuf1_i = sbuf1[pa[tmp2]];
2760     jmax    = sbuf1_i[0];
2761     ct1     = 2 * jmax + 1;
2762     ct2     = 0;
2763     rbuf2_i = rbuf2[tmp2];
2764     rbuf3_i = rbuf3[tmp2];
2765     rbuf4_i = rbuf4[tmp2];
2766     for (j = 1; j <= jmax; j++) {
2767       is_no  = sbuf1_i[2 * j - 1];
2768       rmap_i = rmap[is_no];
2769       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2770       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2771       imat_ilen = subc->ilen;
2772       imat_j    = subc->j;
2773       imat_i    = subc->i;
2774       imat_a    = subc->a;
2775       max1      = sbuf1_i[2 * j];
2776       for (k = 0; k < max1; k++, ct1++) {
2777         row = sbuf1_i[ct1];
2778 #if defined(PETSC_USE_CTABLE)
2779         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2780         row--;
2781 #else
2782         row = rmap_i[row];
2783 #endif
2784         ilen  = imat_ilen[row];
2785         mat_i = imat_i[row];
2786         mat_a = imat_a + mat_i;
2787         mat_j = imat_j + mat_i;
2788         max2  = rbuf2_i[ct1];
2789         if (!allcolumns[is_no]) {
2790           for (l = 0; l < max2; l++, ct2++) {
2791 #if defined(PETSC_USE_CTABLE)
2792             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2793 #else
2794             tcol = cmap_i[rbuf3_i[ct2]];
2795 #endif
2796             if (tcol) {
2797               *mat_j++ = tcol - 1;
2798               *mat_a++ = rbuf4_i[ct2];
2799               ilen++;
2800             }
2801           }
2802         } else { /* allcolumns */
2803           for (l = 0; l < max2; l++, ct2++) {
2804             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2805             *mat_a++ = rbuf4_i[ct2];
2806             ilen++;
2807           }
2808         }
2809         imat_ilen[row] = ilen;
2810       }
2811     }
2812   }
2813 
2814   if (!iscsorted) { /* sort column indices of the rows */
2815     for (i = 0; i < ismax; i++) {
2816       subc      = (Mat_SeqAIJ *)submats[i]->data;
2817       imat_j    = subc->j;
2818       imat_i    = subc->i;
2819       imat_a    = subc->a;
2820       imat_ilen = subc->ilen;
2821 
2822       if (allcolumns[i]) continue;
2823       jmax = nrow[i];
2824       for (j = 0; j < jmax; j++) {
2825         mat_i = imat_i[j];
2826         mat_a = imat_a + mat_i;
2827         mat_j = imat_j + mat_i;
2828         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2829       }
2830     }
2831   }
2832 
2833   PetscCall(PetscFree(r_waits4));
2834   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2835   PetscCall(PetscFree(s_waits4));
2836 
2837   /* Restore the indices */
2838   for (i = 0; i < ismax; i++) {
2839     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2840     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2841   }
2842 
2843   for (i = 0; i < ismax; i++) {
2844     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2845     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2846   }
2847 
2848   /* Destroy allocated memory */
2849   if (sbuf_aa) {
2850     PetscCall(PetscFree(sbuf_aa[0]));
2851     PetscCall(PetscFree(sbuf_aa));
2852   }
2853   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2854 
2855   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2856   PetscCall(PetscFree(rbuf4));
2857 
2858   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2859   PetscFunctionReturn(PETSC_SUCCESS);
2860 }
2861 
2862 /*
2863  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2864  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2866  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2867  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2868  state, and needs to be "assembled" later by compressing B's column space.
2869 
2870  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2871  Following this call, C->A & C->B have been created, even if empty.
2872  */
2873 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2874 {
2875   /* If making this function public, change the error returned in this function away from _PLIB. */
2876   Mat_MPIAIJ     *aij;
2877   Mat_SeqAIJ     *Baij;
2878   PetscBool       seqaij, Bdisassembled;
2879   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2880   PetscScalar     v;
2881   const PetscInt *rowindices, *colindices;
2882 
2883   PetscFunctionBegin;
2884   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2885   if (A) {
2886     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2887     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2888     if (rowemb) {
2889       PetscCall(ISGetLocalSize(rowemb, &m));
2890       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2891     } else {
2892       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2893     }
2894     if (dcolemb) {
2895       PetscCall(ISGetLocalSize(dcolemb, &n));
2896       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2897     } else {
2898       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2899     }
2900   }
2901   if (B) {
2902     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2903     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2904     if (rowemb) {
2905       PetscCall(ISGetLocalSize(rowemb, &m));
2906       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2907     } else {
2908       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2909     }
2910     if (ocolemb) {
2911       PetscCall(ISGetLocalSize(ocolemb, &n));
2912       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2913     } else {
2914       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2915     }
2916   }
2917 
2918   aij = (Mat_MPIAIJ *)C->data;
2919   if (!aij->A) {
2920     /* Mimic parts of MatMPIAIJSetPreallocation() */
2921     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2922     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2923     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2924     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2925   }
2926   if (A) {
2927     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2928   } else {
2929     PetscCall(MatSetUp(aij->A));
2930   }
2931   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2932     /*
2933       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2934       need to "disassemble" B -- convert it to using C's global indices.
2935       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2936 
2937       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2938       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2939 
2940       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2941       At least avoid calling MatSetValues() and the implied searches?
2942     */
2943 
2944     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2945 #if defined(PETSC_USE_CTABLE)
2946       PetscCall(PetscHMapIDestroy(&aij->colmap));
2947 #else
2948       PetscCall(PetscFree(aij->colmap));
2949       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2950 #endif
2951       ngcol = 0;
2952       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2953       if (aij->garray) { PetscCall(PetscFree(aij->garray)); }
2954       PetscCall(VecDestroy(&aij->lvec));
2955       PetscCall(VecScatterDestroy(&aij->Mvctx));
2956     }
2957     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2958     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2959   }
2960   Bdisassembled = PETSC_FALSE;
2961   if (!aij->B) {
2962     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2963     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2964     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2965     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2966     Bdisassembled = PETSC_TRUE;
2967   }
2968   if (B) {
2969     Baij = (Mat_SeqAIJ *)B->data;
2970     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2971       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2972       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2973       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2974       PetscCall(PetscFree(nz));
2975     }
2976 
2977     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2978     shift      = rend - rstart;
2979     count      = 0;
2980     rowindices = NULL;
2981     colindices = NULL;
2982     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2983     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2984     for (i = 0; i < B->rmap->n; i++) {
2985       PetscInt row;
2986       row = i;
2987       if (rowindices) row = rowindices[i];
2988       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2989         col = Baij->j[count];
2990         if (colindices) col = colindices[col];
2991         if (Bdisassembled && col >= rstart) col += shift;
2992         v = Baij->a[count];
2993         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2994         ++count;
2995       }
2996     }
2997     /* No assembly for aij->B is necessary. */
2998     /* FIXME: set aij->B's nonzerostate correctly. */
2999   } else {
3000     PetscCall(MatSetUp(aij->B));
3001   }
3002   C->preallocated  = PETSC_TRUE;
3003   C->was_assembled = PETSC_FALSE;
3004   C->assembled     = PETSC_FALSE;
3005   /*
3006       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3007       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3008    */
3009   PetscFunctionReturn(PETSC_SUCCESS);
3010 }
3011 
3012 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3014  */
3015 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3016 {
3017   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3018 
3019   PetscFunctionBegin;
3020   PetscValidPointer(A, 2);
3021   PetscValidPointer(B, 3);
3022   /* FIXME: make sure C is assembled */
3023   *A = aij->A;
3024   *B = aij->B;
3025   /* Note that we don't incref *A and *B, so be careful! */
3026   PetscFunctionReturn(PETSC_SUCCESS);
3027 }
3028 
3029 /*
3030   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3031   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3032 */
3033 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3034 {
3035   PetscMPIInt size, flag;
3036   PetscInt    i, ii, cismax, ispar;
3037   Mat        *A, *B;
3038   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3039 
3040   PetscFunctionBegin;
3041   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3042 
3043   for (i = 0, cismax = 0; i < ismax; ++i) {
3044     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3045     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3046     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3047     if (size > 1) ++cismax;
3048   }
3049 
3050   /*
3051      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3052      ispar counts the number of parallel ISs across C's comm.
3053   */
3054   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3055   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3056     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3057     PetscFunctionReturn(PETSC_SUCCESS);
3058   }
3059 
3060   /* if (ispar) */
3061   /*
3062     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3063     These are used to extract the off-diag portion of the resulting parallel matrix.
3064     The row IS for the off-diag portion is the same as for the diag portion,
3065     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3066   */
3067   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3068   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3069   for (i = 0, ii = 0; i < ismax; ++i) {
3070     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3071     if (size > 1) {
3072       /*
3073          TODO: This is the part that's ***NOT SCALABLE***.
3074          To fix this we need to extract just the indices of C's nonzero columns
3075          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3076          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3077          to be done without serializing on the IS list, so, most likely, it is best
3078          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3079       */
3080       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3081       /* Now we have to
3082          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3083              were sorted on each rank, concatenated they might no longer be sorted;
3084          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3085              indices in the nondecreasing order to the original index positions.
3086          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3087       */
3088       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3089       PetscCall(ISSort(ciscol[ii]));
3090       ++ii;
3091     }
3092   }
3093   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3094   for (i = 0, ii = 0; i < ismax; ++i) {
3095     PetscInt        j, issize;
3096     const PetscInt *indices;
3097 
3098     /*
3099        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3100      */
3101     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3102     PetscCall(ISSort(isrow[i]));
3103     PetscCall(ISGetLocalSize(isrow[i], &issize));
3104     PetscCall(ISGetIndices(isrow[i], &indices));
3105     for (j = 1; j < issize; ++j) {
3106       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3107     }
3108     PetscCall(ISRestoreIndices(isrow[i], &indices));
3109     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3110     PetscCall(ISSort(iscol[i]));
3111     PetscCall(ISGetLocalSize(iscol[i], &issize));
3112     PetscCall(ISGetIndices(iscol[i], &indices));
3113     for (j = 1; j < issize; ++j) {
3114       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3115     }
3116     PetscCall(ISRestoreIndices(iscol[i], &indices));
3117     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3118     if (size > 1) {
3119       cisrow[ii] = isrow[i];
3120       ++ii;
3121     }
3122   }
3123   /*
3124     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3125     array of sequential matrices underlying the resulting parallel matrices.
3126     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3127     contain duplicates.
3128 
3129     There are as many diag matrices as there are original index sets. There are only as many parallel
3130     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3131 
3132     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3133     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3134       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3135       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3136       with A[i] and B[ii] extracted from the corresponding MPI submat.
3137     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3138       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3139       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3140       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3141       values into A[i] and B[ii] sitting inside the corresponding submat.
3142     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3143       A[i], B[ii], AA[i] or BB[ii] matrices.
3144   */
3145   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3146   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3147 
3148   /* Now obtain the sequential A and B submatrices separately. */
3149   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3150   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3151   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3152 
3153   /*
3154     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3155     matrices A & B have been extracted directly into the parallel matrices containing them, or
3156     simply into the sequential matrix identical with the corresponding A (if size == 1).
3157     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3158     to have the same sparsity pattern.
3159     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3160     must be constructed for C. This is done by setseqmat(s).
3161   */
3162   for (i = 0, ii = 0; i < ismax; ++i) {
3163     /*
3164        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3165        That way we can avoid sorting and computing permutations when reusing.
3166        To this end:
3167         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3168         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3169           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3170     */
3171     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3172 
3173     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3174     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3175     if (size > 1) {
3176       if (scall == MAT_INITIAL_MATRIX) {
3177         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3178         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3179         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3180         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3181         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3182       }
3183       /*
3184         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3185       */
3186       {
3187         Mat AA = A[i], BB = B[ii];
3188 
3189         if (AA || BB) {
3190           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3191           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3192           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3193         }
3194         PetscCall(MatDestroy(&AA));
3195       }
3196       PetscCall(ISDestroy(ciscol + ii));
3197       PetscCall(ISDestroy(ciscol_p + ii));
3198       ++ii;
3199     } else { /* if (size == 1) */
3200       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3201       if (isrow_p[i] || iscol_p[i]) {
3202         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3203         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3204         /* Otherwise A is extracted straight into (*submats)[i]. */
3205         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3206         PetscCall(MatDestroy(A + i));
3207       } else (*submat)[i] = A[i];
3208     }
3209     PetscCall(ISDestroy(&isrow_p[i]));
3210     PetscCall(ISDestroy(&iscol_p[i]));
3211   }
3212   PetscCall(PetscFree2(cisrow, ciscol));
3213   PetscCall(PetscFree2(isrow_p, iscol_p));
3214   PetscCall(PetscFree(ciscol_p));
3215   PetscCall(PetscFree(A));
3216   PetscCall(MatDestroySubMatrices(cismax, &B));
3217   PetscFunctionReturn(PETSC_SUCCESS);
3218 }
3219 
3220 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3221 {
3222   PetscFunctionBegin;
3223   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3224   PetscFunctionReturn(PETSC_SUCCESS);
3225 }
3226