1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size bs, which would offer higher performance,
25    but that is a very major recoding job.
26 
27    This routine has possible scalability issues because it allocates space proportional to Nmax-Nmin
28 */
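/*
   Illustrative example (values chosen only for illustration): with bs = 2, an IS containing the
   point indices {1, 4, 5} touches the blocks {0, 2}, so the routine replaces it with the block IS
   whose block indices are {0, 2}, i.e. the point set {0, 1, 4, 5}.
*/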
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
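/*
   Usage sketch (illustrative only; C, nlocal and rstart are assumed to come from the caller's
   context). User code reaches this routine through the public MatIncreaseOverlap() interface,
   which dispatches here for MATMPIAIJ matrices:

     IS is;
     PetscCall(ISCreateStride(PETSC_COMM_WORLD, nlocal, rstart, 1, &is));
     PetscCall(MatIncreaseOverlap(C, 1, &is, 2)); // grow the index set by two levels of overlap
     PetscCall(ISDestroy(&is));
*/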
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89 {
90   MPI_Comm        comm;
91   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
92   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94   const PetscInt *indices_i, **indices;
95   PetscLayout     rmap;
96   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
97   PetscSF         sf;
98   PetscSFNode    *remote;
99 
100   PetscFunctionBegin;
101   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103   PetscCallMPI(MPI_Comm_size(comm, &size));
104   /* get row map to determine where rows should be going */
105   PetscCall(MatGetLayouts(mat, &rmap, NULL));
106   /* retrieve IS data and put it all together
107    * so that we can optimize communication
108    */
109   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110   for (i = 0, tlength = 0; i < nidx; i++) {
111     PetscCall(ISGetLocalSize(is[i], &length[i]));
112     tlength += length[i];
113     PetscCall(ISGetIndices(is[i], &indices[i]));
114   }
115   /* find these rows on remote processors */
116   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118   nrrows = 0;
119   for (i = 0; i < nidx; i++) {
120     length_i  = length[i];
121     indices_i = indices[i];
122     for (j = 0; j < length_i; j++) {
123       owner = -1;
124       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125       /* remote processors */
126       if (owner != rank) {
127         tosizes_temp[owner]++;               /* number of rows to owner */
128         rrow_ranks[nrrows]   = owner;        /* processor */
129         rrow_isids[nrrows]   = i;            /* is id */
130         remoterows[nrrows++] = indices_i[j]; /* row */
131       }
132     }
133     PetscCall(ISRestoreIndices(is[i], &indices[i]));
134   }
135   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136   /* test if we need to exchange messages;
137    * generally speaking, no data needs to be
138    * exchanged when the overlap is 1
139    */
140   PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
141   /* there are no messages to exchange;
142    * this usually corresponds to overlap 1
143    */
144   if (!reducednrrows) {
145     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148     PetscFunctionReturn(PETSC_SUCCESS);
149   }
150   nto = 0;
151   /* send sizes and ranks for building a two-sided communication */
152   for (i = 0; i < size; i++) {
153     if (tosizes_temp[i]) {
154       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155       tosizes_temp[i]  = nto;                 /* a map from processor to index */
156       toranks[nto++]   = i;                   /* processor */
157     }
158   }
159   PetscCall(PetscMalloc1(nto + 1, &toffsets));
160   toffsets[0] = 0;
161   for (i = 0; i < nto; i++) {
162     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
163     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
164   }
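  /* at this point tosizes[2*i] is the number of PetscInt values destined for toranks[i] and
     tosizes[2*i+1] is the offset of that block within todata; PetscCommBuildTwoSided() below
     delivers the matching <size, offset> pairs to the receivers in fromsizes */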
165   /* send information to other processors */
166   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167   nrecvrows = 0;
168   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169   PetscCall(PetscMalloc1(nrecvrows, &remote));
170   nrecvrows = 0;
171   for (i = 0; i < nfrom; i++) {
172     for (j = 0; j < fromsizes[2 * i]; j++) {
173       remote[nrecvrows].rank    = fromranks[i];
174       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175     }
176   }
177   PetscCall(PetscSFCreate(comm, &sf));
178   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
179   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
180   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181   PetscCall(PetscSFSetFromOptions(sf));
182   /* message pairs <index set id, row> */
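  /* e.g. (illustrative) a request for global row 42 that belongs to is[3] contributes the pair 3, 42 to todata */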
183   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184   for (i = 0; i < nrrows; i++) {
185     owner                 = rrow_ranks[i];       /* processor */
186     j                     = tosizes_temp[owner]; /* index */
187     todata[toffsets[j]++] = rrow_isids[i];
188     todata[toffsets[j]++] = remoterows[i];
189   }
190   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192   PetscCall(PetscFree(toffsets));
193   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195   PetscCall(PetscSFDestroy(&sf));
196   /* send the rows that belong to remote processes so that we can get the overlapping data back */
197   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198   PetscCall(PetscFree2(todata, fromdata));
199   PetscCall(PetscFree(fromsizes));
200   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201   PetscCall(PetscFree(fromranks));
202   nrecvrows = 0;
203   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204   PetscCall(PetscCalloc1(nrecvrows, &todata));
205   PetscCall(PetscMalloc1(nrecvrows, &remote));
206   nrecvrows = 0;
207   for (i = 0; i < nto; i++) {
208     for (j = 0; j < tosizes[2 * i]; j++) {
209       remote[nrecvrows].rank    = toranks[i];
210       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211     }
212   }
213   PetscCall(PetscSFCreate(comm, &sf));
214   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
215   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
216   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217   PetscCall(PetscSFSetFromOptions(sf));
218   /* overlap communication and computation */
219   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222   PetscCall(PetscSFDestroy(&sf));
223   PetscCall(PetscFree2(sbdata, sbsizes));
224   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225   PetscCall(PetscFree(toranks));
226   PetscCall(PetscFree(tosizes));
227   PetscCall(PetscFree(todata));
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232 {
233   PetscInt       *isz, isz_i, i, j, is_id, data_size;
234   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
235   const PetscInt *indices_i_temp;
236   MPI_Comm       *iscomms;
237 
238   PetscFunctionBegin;
239   max_lsize = 0;
240   PetscCall(PetscMalloc1(nidx, &isz));
241   for (i = 0; i < nidx; i++) {
242     PetscCall(ISGetLocalSize(is[i], &lsize));
243     max_lsize = lsize > max_lsize ? lsize : max_lsize;
244     isz[i]    = lsize;
245   }
246   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247   for (i = 0; i < nidx; i++) {
248     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249     PetscCall(ISGetIndices(is[i], &indices_i_temp));
250     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
251     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252     PetscCall(ISDestroy(&is[i]));
253   }
254   /* retrieve information to get row id and its overlap */
255   for (i = 0; i < nrecvs;) {
256     is_id     = recvdata[i++];
257     data_size = recvdata[i++];
258     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259     isz_i     = isz[is_id];
260     for (j = 0; j < data_size; j++) {
261       col                = recvdata[i++];
262       indices_i[isz_i++] = col;
263     }
264     isz[is_id] = isz_i;
265   }
266   /* remove duplicate entries */
267   for (i = 0; i < nidx; i++) {
268     indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
269     isz_i     = isz[i];
270     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272     PetscCall(PetscCommDestroy(&iscomms[i]));
273   }
274   PetscCall(PetscFree(isz));
275   PetscCall(PetscFree2(indices_temp, iscomms));
276   PetscFunctionReturn(PETSC_SUCCESS);
277 }
278 
279 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280 {
281   PetscLayout     rmap, cmap;
282   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285   const PetscInt *gcols, *ai, *aj, *bi, *bj;
286   Mat             amat, bmat;
287   PetscMPIInt     rank;
288   PetscBool       done;
289   MPI_Comm        comm;
290 
291   PetscFunctionBegin;
292   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293   PetscCallMPI(MPI_Comm_rank(comm, &rank));
294   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
295   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
296   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
298   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
300   /* total number of nonzero values is used to estimate the memory usage in the next step */
301   tnz = ai[an] + bi[bn];
302   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305   /* to find the longest message */
306   max_fszs = 0;
307   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
308   /* is there a better way to estimate the number of nonzeros in the matrix? */
309   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310   for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
311   rows_pos  = 0;
312   totalrows = 0;
313   for (i = 0; i < nfrom; i++) {
314     PetscCall(PetscArrayzero(rows_pos_i, nidx));
315     /* group data together */
316     for (j = 0; j < fromsizes[2 * i]; j += 2) {
317       is_id                       = fromrows[rows_pos++]; /* no of is */
318       rows_i                      = rows_data[is_id];
319       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320     }
321     /* estimate the space needed to avoid multiple allocations */
322     for (j = 0; j < nidx; j++) {
323       indvc_ij = 0;
324       rows_i   = rows_data[j];
325       for (l = 0; l < rows_pos_i[j]; l++) {
326         row   = rows_i[l] - rstart;
327         start = ai[row];
328         end   = ai[row + 1];
329         for (k = start; k < end; k++) { /* Amat */
330           col                     = aj[k] + cstart;
331           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332         }
333         start = bi[row];
334         end   = bi[row + 1];
335         for (k = start; k < end; k++) { /* Bmat */
336           col                     = gcols[bj[k]];
337           indices_tmp[indvc_ij++] = col;
338         }
339       }
340       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341       indv_counts[i * nidx + j] = indvc_ij;
342       totalrows += indvc_ij;
343     }
344   }
345   /* message triple <no of is, number of rows, rows> */
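  /* e.g. (illustrative) overlap columns {10, 12, 15} computed for is[1] are encoded as the run 1, 3, 10, 12, 15 in sbdata */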
346   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347   totalrows = 0;
348   rows_pos  = 0;
349   /* regroup the requested rows by index set, as above */
350   for (i = 0; i < nfrom; i++) {
351     PetscCall(PetscArrayzero(rows_pos_i, nidx));
352     for (j = 0; j < fromsizes[2 * i]; j += 2) {
353       is_id                       = fromrows[rows_pos++];
354       rows_i                      = rows_data[is_id];
355       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356     }
357     /* add data  */
358     for (j = 0; j < nidx; j++) {
359       if (!indv_counts[i * nidx + j]) continue;
360       indvc_ij            = 0;
361       sbdata[totalrows++] = j;
362       sbdata[totalrows++] = indv_counts[i * nidx + j];
363       sbsizes[2 * i] += 2;
364       rows_i = rows_data[j];
365       for (l = 0; l < rows_pos_i[j]; l++) {
366         row   = rows_i[l] - rstart;
367         start = ai[row];
368         end   = ai[row + 1];
369         for (k = start; k < end; k++) { /* Amat */
370           col                     = aj[k] + cstart;
371           indices_tmp[indvc_ij++] = col;
372         }
373         start = bi[row];
374         end   = bi[row + 1];
375         for (k = start; k < end; k++) { /* Bmat */
376           col                     = gcols[bj[k]];
377           indices_tmp[indvc_ij++] = col;
378         }
379       }
380       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381       sbsizes[2 * i] += indvc_ij;
382       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383       totalrows += indvc_ij;
384     }
385   }
386   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387   offsets[0] = 0;
388   for (i = 0; i < nfrom; i++) {
389     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
390     sbsizes[2 * i + 1] = offsets[i];
391   }
392   PetscCall(PetscFree(offsets));
393   if (sbrowsizes) *sbrowsizes = sbsizes;
394   if (sbrows) *sbrows = sbdata;
395   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398   PetscFunctionReturn(PETSC_SUCCESS);
399 }
400 
401 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402 {
403   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405   PetscInt        lsize, lsize_tmp;
406   PetscMPIInt     rank, owner;
407   Mat             amat, bmat;
408   PetscBool       done;
409   PetscLayout     cmap, rmap;
410   MPI_Comm        comm;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414   PetscCallMPI(MPI_Comm_rank(comm, &rank));
415   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
418   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
420   /* is this a safe way to compute the number of nonzero values? */
421   tnz = ai[an] + bi[bn];
422   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
425   /* this estimates memory better than the old implementation,
426    * which used the global size of the matrix
427    */
428   PetscCall(PetscMalloc1(tnz, &indices_temp));
429   for (i = 0; i < nidx; i++) {
430     MPI_Comm iscomm;
431 
432     PetscCall(ISGetLocalSize(is[i], &lsize));
433     PetscCall(ISGetIndices(is[i], &indices));
434     lsize_tmp = 0;
435     for (j = 0; j < lsize; j++) {
436       owner = -1;
437       row   = indices[j];
438       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439       if (owner != rank) continue;
440       /* local number */
441       row -= rstart;
442       start = ai[row];
443       end   = ai[row + 1];
444       for (k = start; k < end; k++) { /* Amat */
445         col                       = aj[k] + cstart;
446         indices_temp[lsize_tmp++] = col;
447       }
448       start = bi[row];
449       end   = bi[row + 1];
450       for (k = start; k < end; k++) { /* Bmat */
451         col                       = gcols[bj[k]];
452         indices_temp[lsize_tmp++] = col;
453       }
454     }
455     PetscCall(ISRestoreIndices(is[i], &indices));
456     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457     PetscCall(ISDestroy(&is[i]));
458     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460     PetscCall(PetscCommDestroy(&iscomm));
461   }
462   PetscCall(PetscFree(indices_temp));
463   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465   PetscFunctionReturn(PETSC_SUCCESS);
466 }
467 
468 /*
469   Sample message format:
470   If a processor A wants processor B to process some elements corresponding
471   to index sets is[1],is[5]
472   mesg [0] = 2   (no of index sets in the mesg)
473   -----------
474   mesg [1] = 1 => is[1]
475   mesg [2] = sizeof(is[1]);
476   -----------
477   mesg [3] = 5  => is[5]
478   mesg [4] = sizeof(is[5]);
479   -----------
480   mesg [5] ...
481   mesg [n] = data(is[1])
482   -----------
483   mesg [n+1] ...
484   mesg [m] = data(is[5])
485   -----------
486 
487   Notes:
488   nrqs - no of requests sent (or to be sent out)
489   nrqr - no of requests received (which have to be or which have been processed)
490 */
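/*
   Concrete illustration of the format above (row numbers chosen only for illustration): a request
   for rows {3, 7} of is[1] and row {10} of is[5] is encoded as

     mesg[] = { 2,  1, 2,  5, 1,  3, 7,  10 }

   i.e. two index sets in the message; is[1] with 2 entries and is[5] with 1 entry; then the data
   3, 7 for is[1] followed by the data 10 for is[5].
*/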
491 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492 {
493   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495   const PetscInt **idx, *idx_i;
496   PetscInt        *n, **data, len;
497 #if defined(PETSC_USE_CTABLE)
498   PetscHMapI *table_data, table_data_i;
499   PetscInt   *tdata, tcount, tcount_max;
500 #else
501   PetscInt *data_i, *d_p;
502 #endif
503   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506   PetscBT     *table;
507   MPI_Comm     comm;
508   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509   MPI_Status  *recv_status;
510   MPI_Comm    *iscomms;
511   char        *t_p;
512 
513   PetscFunctionBegin;
514   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515   size = c->size;
516   rank = c->rank;
517   M    = C->rmap->N;
518 
519   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
521 
522   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
523 
524   for (i = 0; i < imax; i++) {
525     PetscCall(ISGetIndices(is[i], &idx[i]));
526     PetscCall(ISGetLocalSize(is[i], &n[i]));
527   }
528 
529   /* evaluate communication - mesg to whom, length of mesg, and buffer space
530      required. Based on this, buffers are allocated and data is copied into them */
531   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
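  /* w4[p] - rows of the current IS owned by process p; w1[p] - total rows destined for p over all
     index sets; w3[p] - number of index sets that contribute something for p; w2[p] - set to 1 if
     any message goes to p at all */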
532   for (i = 0; i < imax; i++) {
533     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534     idx_i = idx[i];
535     len   = n[i];
536     for (j = 0; j < len; j++) {
537       row = idx_i[j];
538       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540       w4[proc]++;
541     }
542     for (j = 0; j < size; j++) {
543       if (w4[j]) {
544         w1[j] += w4[j];
545         w3[j]++;
546       }
547     }
548   }
549 
550   nrqs     = 0; /* no of outgoing messages */
551   msz      = 0; /* total mesg length (for all procs) */
552   w1[rank] = 0; /* no mesg sent to itself */
553   w3[rank] = 0;
554   for (i = 0; i < size; i++) {
555     if (w1[i]) {
556       w2[i] = 1;
557       nrqs++;
558     } /* there exists a message to proc i */
559   }
560   /* pa - is list of processors to communicate with */
561   PetscCall(PetscMalloc1(nrqs, &pa));
562   for (i = 0, j = 0; i < size; i++) {
563     if (w1[i]) {
564       pa[j] = i;
565       j++;
566     }
567   }
568 
569   /* Each message would have a header = 1 + 2*(no of IS) + data */
570   for (i = 0; i < nrqs; i++) {
571     j = pa[i];
572     w1[j] += w2[j] + 2 * w3[j];
573     msz += w1[j];
574   }
575 
576   /* Determine the number of messages to expect, their lengths, and the from-ids */
577   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
579 
580   /* Now post the Irecvs corresponding to these messages */
581   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
582 
583   /* Allocate Memory for outgoing messages */
584   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585   PetscCall(PetscArrayzero(outdat, size));
586   PetscCall(PetscArrayzero(ptr, size));
587 
588   {
589     PetscInt *iptr = tmp, ict = 0;
590     for (i = 0; i < nrqs; i++) {
591       j = pa[i];
592       iptr += ict;
593       outdat[j] = iptr;
594       ict       = w1[j];
595     }
596   }
597 
598   /* Form the outgoing messages */
599   /* plug in the headers */
600   for (i = 0; i < nrqs; i++) {
601     j            = pa[i];
602     outdat[j][0] = 0;
603     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605   }
606 
607   /* Memory for doing local proc's work */
608   {
609     PetscInt M_BPB_imax = 0;
610 #if defined(PETSC_USE_CTABLE)
611     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612     PetscCall(PetscMalloc1(imax, &table_data));
613     for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616 #else
617     PetscInt Mimax = 0;
618     PetscCall(PetscIntMultError(M, imax, &Mimax));
619     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621     for (i = 0; i < imax; i++) {
622       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623       data[i]  = d_p + M * i;
624     }
625 #endif
626   }
627 
628   /* Parse the IS and update local tables and the outgoing buf with the data */
629   {
630     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631     PetscBT  table_i;
632 
633     for (i = 0; i < imax; i++) {
634       PetscCall(PetscArrayzero(ctr, size));
635       n_i     = n[i];
636       table_i = table[i];
637       idx_i   = idx[i];
638 #if defined(PETSC_USE_CTABLE)
639       table_data_i = table_data[i];
640 #else
641       data_i = data[i];
642 #endif
643       isz_i = isz[i];
644       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645         row = idx_i[j];
646         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647         if (proc != rank) { /* copy to the outgoing buffer */
648           ctr[proc]++;
649           *ptr[proc] = row;
650           ptr[proc]++;
651         } else if (!PetscBTLookupSet(table_i, row)) {
652 #if defined(PETSC_USE_CTABLE)
653           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654 #else
655           data_i[isz_i] = row; /* Update the local table */
656 #endif
657           isz_i++;
658         }
659       }
660       /* Update the headers for the current IS */
661       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662         if ((ctr_j = ctr[j])) {
663           outdat_j            = outdat[j];
664           k                   = ++outdat_j[0];
665           outdat_j[2 * k]     = ctr_j;
666           outdat_j[2 * k - 1] = i;
667         }
668       }
669       isz[i] = isz_i;
670     }
671   }
672 
673   /*  Now  post the sends */
674   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675   for (i = 0; i < nrqs; ++i) {
676     j = pa[i];
677     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678   }
679 
680   /* No longer need the original indices */
681   PetscCall(PetscMalloc1(imax, &iscomms));
682   for (i = 0; i < imax; ++i) {
683     PetscCall(ISRestoreIndices(is[i], idx + i));
684     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685   }
686   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
687 
688   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
689 
690     /* Do Local work */
691 #if defined(PETSC_USE_CTABLE)
692   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693 #else
694   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695 #endif
696 
697   /* Receive messages */
698   PetscCall(PetscMalloc1(nrqr, &recv_status));
699   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
701 
702   /* Phase 1 sends are complete - deallocate buffers */
703   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704   PetscCall(PetscFree4(w1, w2, w3, w4));
705 
706   PetscCall(PetscMalloc1(nrqr, &xdata));
707   PetscCall(PetscMalloc1(nrqr, &isz1));
708   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709   PetscCall(PetscFree(rbuf[0]));
710   PetscCall(PetscFree(rbuf));
711 
712   /* Send the data back */
713   /* Do a global reduction to know the buffer space required for incoming messages */
714   {
715     PetscMPIInt *rw1;
716 
717     PetscCall(PetscCalloc1(size, &rw1));
718 
719     for (i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721 
722       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723       rw1[proc] = isz1[i];
724     }
725     PetscCall(PetscFree(onodes1));
726     PetscCall(PetscFree(olengths1));
727 
728     /* Determine the number of messages to expect, their lengths, and the from-ids */
729     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730     PetscCall(PetscFree(rw1));
731   }
732   /* Now post the Irecvs corresponding to these messages */
733   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
734 
735   /* Now  post the sends */
736   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737   for (i = 0; i < nrqr; ++i) {
738     j = recv_status[i].MPI_SOURCE;
739     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740   }
741 
742   /* receive work done on other processors */
743   {
744     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745     PetscMPIInt idex;
746     PetscBT     table_i;
747 
748     for (i = 0; i < nrqs; ++i) {
749       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750       /* Process the message */
751       rbuf2_i = rbuf2[idex];
752       ct1     = 2 * rbuf2_i[0] + 1;
753       jmax    = rbuf2[idex][0];
754       for (j = 1; j <= jmax; j++) {
755         max     = rbuf2_i[2 * j];
756         is_no   = rbuf2_i[2 * j - 1];
757         isz_i   = isz[is_no];
758         table_i = table[is_no];
759 #if defined(PETSC_USE_CTABLE)
760         table_data_i = table_data[is_no];
761 #else
762         data_i = data[is_no];
763 #endif
764         for (k = 0; k < max; k++, ct1++) {
765           row = rbuf2_i[ct1];
766           if (!PetscBTLookupSet(table_i, row)) {
767 #if defined(PETSC_USE_CTABLE)
768             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769 #else
770             data_i[isz_i] = row;
771 #endif
772             isz_i++;
773           }
774         }
775         isz[is_no] = isz_i;
776       }
777     }
778 
779     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780   }
781 
782 #if defined(PETSC_USE_CTABLE)
783   tcount_max = 0;
784   for (i = 0; i < imax; ++i) {
785     table_data_i = table_data[i];
786     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787     if (tcount_max < tcount) tcount_max = tcount;
788   }
789   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790 #endif
791 
792   for (i = 0; i < imax; ++i) {
793 #if defined(PETSC_USE_CTABLE)
794     PetscHashIter tpos;
795 
796     table_data_i = table_data[i];
797     PetscHashIterBegin(table_data_i, tpos);
798     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799       PetscHashIterGetKey(table_data_i, tpos, k);
800       PetscHashIterGetVal(table_data_i, tpos, j);
801       PetscHashIterNext(table_data_i, tpos);
802       tdata[--j] = --k;
803     }
804     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805 #else
806     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807 #endif
808     PetscCall(PetscCommDestroy(&iscomms[i]));
809   }
810 
811   PetscCall(PetscFree(iscomms));
812   PetscCall(PetscFree(onodes2));
813   PetscCall(PetscFree(olengths2));
814 
815   PetscCall(PetscFree(pa));
816   PetscCall(PetscFree(rbuf2[0]));
817   PetscCall(PetscFree(rbuf2));
818   PetscCall(PetscFree(s_waits1));
819   PetscCall(PetscFree(r_waits1));
820   PetscCall(PetscFree(s_waits2));
821   PetscCall(PetscFree(r_waits2));
822   PetscCall(PetscFree(recv_status));
823   if (xdata) {
824     PetscCall(PetscFree(xdata[0]));
825     PetscCall(PetscFree(xdata));
826   }
827   PetscCall(PetscFree(isz1));
828 #if defined(PETSC_USE_CTABLE)
829   for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830   PetscCall(PetscFree(table_data));
831   PetscCall(PetscFree(tdata));
832   PetscCall(PetscFree4(table, data, isz, t_p));
833 #else
834   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835 #endif
836   PetscFunctionReturn(PETSC_SUCCESS);
837 }
838 
839 /*
840    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
841        the work on the local processor.
842 
843      Inputs:
844       C      - the MATMPIAIJ matrix;
845       imax   - total number of index sets processed at a time;
846       table  - an array of char - size = m bits.
847 
848      Output:
849       isz    - array containing the count of the solution elements corresponding
850                to each index set;
851       data or table_data  - pointer to the solutions
852 */
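/*
   Note on the PETSC_USE_CTABLE branch below: the hash map stores key = row + 1 and value =
   position + 1 so that the value 0 can be reserved for "not present"; the --row / --j shifts
   where the map is unpacked undo this offset.
*/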
853 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854 {
855   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856   Mat         A = c->A, B = c->B;
857   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
859   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
860   PetscBT     table_i;
861 #if defined(PETSC_USE_CTABLE)
862   PetscHMapI    table_data_i;
863   PetscHashIter tpos;
864   PetscInt      tcount, *tdata;
865 #else
866   PetscInt *data_i;
867 #endif
868 
869   PetscFunctionBegin;
870   rstart = C->rmap->rstart;
871   cstart = C->cmap->rstart;
872   ai     = a->i;
873   aj     = a->j;
874   bi     = b->i;
875   bj     = b->j;
876   garray = c->garray;
877 
878   for (i = 0; i < imax; i++) {
879 #if defined(PETSC_USE_CTABLE)
880     /* copy existing entries of table_data_i into tdata[] */
881     table_data_i = table_data[i];
882     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
884 
885     PetscCall(PetscMalloc1(tcount, &tdata));
886     PetscHashIterBegin(table_data_i, tpos);
887     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888       PetscHashIterGetKey(table_data_i, tpos, row);
889       PetscHashIterGetVal(table_data_i, tpos, j);
890       PetscHashIterNext(table_data_i, tpos);
891       tdata[--j] = --row;
892       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893     }
894 #else
895     data_i = data[i];
896 #endif
897     table_i = table[i];
898     isz_i   = isz[i];
899     max     = isz[i];
900 
901     for (j = 0; j < max; j++) {
902 #if defined(PETSC_USE_CTABLE)
903       row = tdata[j] - rstart;
904 #else
905       row = data_i[j] - rstart;
906 #endif
907       start = ai[row];
908       end   = ai[row + 1];
909       for (k = start; k < end; k++) { /* Amat */
910         val = aj[k] + cstart;
911         if (!PetscBTLookupSet(table_i, val)) {
912 #if defined(PETSC_USE_CTABLE)
913           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914 #else
915           data_i[isz_i] = val;
916 #endif
917           isz_i++;
918         }
919       }
920       start = bi[row];
921       end   = bi[row + 1];
922       for (k = start; k < end; k++) { /* Bmat */
923         val = garray[bj[k]];
924         if (!PetscBTLookupSet(table_i, val)) {
925 #if defined(PETSC_USE_CTABLE)
926           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927 #else
928           data_i[isz_i] = val;
929 #endif
930           isz_i++;
931         }
932       }
933     }
934     isz[i] = isz_i;
935 
936 #if defined(PETSC_USE_CTABLE)
937     PetscCall(PetscFree(tdata));
938 #endif
939   }
940   PetscFunctionReturn(PETSC_SUCCESS);
941 }
942 
943 /*
944       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945          and return the output
946 
947          Input:
948            C    - the matrix
949            nrqr - no of messages being processed.
950            rbuf - an array of pointers to the received requests
951 
952          Output:
953            xdata - array of messages to be sent back
954            isz1  - size of each message
955 
956   For better efficiency perhaps we should malloc each xdata[i] separately;
957 then, if a realloc is required, we would only need to copy the data for that one
958 row rather than for all previous rows, as happens now where a single large chunk
959 of memory is used.
960 
961 */
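/*
   The reply built in xdata[i] mirrors the request layout: xdata[i][0] is the number of index
   sets, followed by pairs <is id, count> and then, for each index set, the count global column
   indices that form the overlap of the requested rows.
*/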
962 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963 {
964   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965   Mat         A = c->A, B = c->B;
966   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970   PetscInt   *rbuf_i, kmax, rbuf_0;
971   PetscBT     xtable;
972 
973   PetscFunctionBegin;
974   m      = C->rmap->N;
975   rstart = C->rmap->rstart;
976   cstart = C->cmap->rstart;
977   ai     = a->i;
978   aj     = a->j;
979   bi     = b->i;
980   bj     = b->j;
981   garray = c->garray;
982 
983   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984     rbuf_i = rbuf[i];
985     rbuf_0 = rbuf_i[0];
986     ct += rbuf_0;
987     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988   }
989 
990   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991   else max1 = 1;
992   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993   if (nrqr) {
994     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995     ++no_malloc;
996   }
997   PetscCall(PetscBTCreate(m, &xtable));
998   PetscCall(PetscArrayzero(isz1, nrqr));
999 
1000   ct3 = 0;
1001   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002     rbuf_i = rbuf[i];
1003     rbuf_0 = rbuf_i[0];
1004     ct1    = 2 * rbuf_0 + 1;
1005     ct2    = ct1;
1006     ct3 += ct1;
1007     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008       PetscCall(PetscBTMemzero(m, xtable));
1009       oct2 = ct2;
1010       kmax = rbuf_i[2 * j];
1011       for (k = 0; k < kmax; k++, ct1++) {
1012         row = rbuf_i[ct1];
1013         if (!PetscBTLookupSet(xtable, row)) {
1014           if (!(ct3 < mem_estimate)) {
1015             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016             PetscCall(PetscMalloc1(new_estimate, &tmp));
1017             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018             PetscCall(PetscFree(xdata[0]));
1019             xdata[0]     = tmp;
1020             mem_estimate = new_estimate;
1021             ++no_malloc;
1022             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023           }
1024           xdata[i][ct2++] = row;
1025           ct3++;
1026         }
1027       }
1028       for (k = oct2, max2 = ct2; k < max2; k++) {
1029         row   = xdata[i][k] - rstart;
1030         start = ai[row];
1031         end   = ai[row + 1];
1032         for (l = start; l < end; l++) {
1033           val = aj[l] + cstart;
1034           if (!PetscBTLookupSet(xtable, val)) {
1035             if (!(ct3 < mem_estimate)) {
1036               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037               PetscCall(PetscMalloc1(new_estimate, &tmp));
1038               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039               PetscCall(PetscFree(xdata[0]));
1040               xdata[0]     = tmp;
1041               mem_estimate = new_estimate;
1042               ++no_malloc;
1043               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044             }
1045             xdata[i][ct2++] = val;
1046             ct3++;
1047           }
1048         }
1049         start = bi[row];
1050         end   = bi[row + 1];
1051         for (l = start; l < end; l++) {
1052           val = garray[bj[l]];
1053           if (!PetscBTLookupSet(xtable, val)) {
1054             if (!(ct3 < mem_estimate)) {
1055               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056               PetscCall(PetscMalloc1(new_estimate, &tmp));
1057               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058               PetscCall(PetscFree(xdata[0]));
1059               xdata[0]     = tmp;
1060               mem_estimate = new_estimate;
1061               ++no_malloc;
1062               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063             }
1064             xdata[i][ct2++] = val;
1065             ct3++;
1066           }
1067         }
1068       }
1069       /* Update the header*/
1070       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072     }
1073     xdata[i][0] = rbuf_0;
1074     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075     isz1[i] = ct2; /* size of each message */
1076   }
1077   PetscCall(PetscBTDestroy(&xtable));
1078   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079   PetscFunctionReturn(PETSC_SUCCESS);
1080 }
1081 
1082 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083 /*
1084     Every processor gets the entire matrix
1085 */
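/*
   Note: *Bin is allocated with length 2 and the assembled sequential matrix is returned in
   (*Bin)[0]. With MAT_INITIAL_MATRIX the row lengths and column indices are gathered with
   MPI_Allreduce(); with MAT_REUSE_MATRIX only the numerical values are refreshed (and only when
   flag == MAT_GET_VALUES).
*/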
1086 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087 {
1088   Mat         B;
1089   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091   PetscMPIInt size, rank;
1092   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1093   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1094 
1095   PetscFunctionBegin;
1096   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098   if (scall == MAT_INITIAL_MATRIX) {
1099     /* Tell every processor the number of nonzeros per row */
1100     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1101     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1102 
1103     /* All MPI processes get the same matrix */
1104     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1105 
1106     /*     Create the sequential matrix of the same type as the local block diagonal  */
1107     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1108     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1109     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1110     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1111     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1112     PetscCall(PetscCalloc1(2, Bin));
1113     **Bin = B;
1114     b     = (Mat_SeqAIJ *)B->data;
1115 
1116     /* zero column space */
1117     nrecv = 0;
1118     for (i = 0; i < size; i++) {
1119       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1120     }
1121     PetscCall(PetscArrayzero(b->j, nrecv));
1122 
1123     /*   Copy my part of matrix column indices over    */
1124     sendcount  = ad->nz + bd->nz;
1125     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1126     a_jsendbuf = ad->j;
1127     b_jsendbuf = bd->j;
1128     n          = A->rmap->rend - A->rmap->rstart;
1129     cnt        = 0;
1130     for (i = 0; i < n; i++) {
1131       /* put in lower diagonal portion */
1132       m = bd->i[i + 1] - bd->i[i];
1133       while (m > 0) {
1134         /* is it above diagonal (in bd (compressed) numbering) */
1135         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1136         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1137         m--;
1138       }
1139 
1140       /* put in diagonal portion */
1141       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1142 
1143       /* put in upper diagonal portion */
1144       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1145     }
1146     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1147 
1148     /* send column indices, b->j was zeroed */
1149     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1150 
1151     /* Assemble the matrix into usable form (numerical values not yet set) */
1152     /* set the b->ilen (length of each row) values */
1153     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154     /* set the b->i indices */
1155     b->i[0] = 0;
1156     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157     PetscCall(PetscFree(lens));
1158     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1160 
1161   } else {
1162     B = **Bin;
1163     b = (Mat_SeqAIJ *)B->data;
1164   }
1165 
1166   /* Copy my part of matrix numerical values into the values location */
1167   if (flag == MAT_GET_VALUES) {
1168     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169     MatScalar         *sendbuf;
1170 
1171     /* initialize b->a */
1172     PetscCall(PetscArrayzero(b->a, b->nz));
1173 
1174     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1175     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1176     sendcount = ad->nz + bd->nz;
1177     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1178     a_sendbuf = ada;
1179     b_sendbuf = bda;
1180     b_sendj   = bd->j;
1181     n         = A->rmap->rend - A->rmap->rstart;
1182     cnt       = 0;
1183     for (i = 0; i < n; i++) {
1184       /* put in lower diagonal portion */
1185       m = bd->i[i + 1] - bd->i[i];
1186       while (m > 0) {
1187         /* is it above diagonal (in bd (compressed) numbering) */
1188         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1189         sendbuf[cnt++] = *b_sendbuf++;
1190         m--;
1191         b_sendj++;
1192       }
1193 
1194       /* put in diagonal portion */
1195       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1196 
1197       /* put in upper diagonal portion */
1198       while (m-- > 0) {
1199         sendbuf[cnt++] = *b_sendbuf++;
1200         b_sendj++;
1201       }
1202     }
1203     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1204 
1205     /* send values, b->a was zeroed */
1206     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1207 
1208     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1209     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1210   } /* end of (flag == MAT_GET_VALUES) */
1211 
1212   PetscCall(MatPropagateSymmetryOptions(A, B));
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1217 {
1218   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1219   Mat             submat, A = c->A, B = c->B;
1220   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1221   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1222   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1223   const PetscInt *icol, *irow;
1224   PetscInt        nrow, ncol, start;
1225   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1226   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1227   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1228   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1229   PetscInt       *lens, rmax, ncols, *cols, Crow;
1230 #if defined(PETSC_USE_CTABLE)
1231   PetscHMapI cmap, rmap;
1232   PetscInt  *cmap_loc, *rmap_loc;
1233 #else
1234   PetscInt *cmap, *rmap;
1235 #endif
1236   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1237   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1238   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1239   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1240   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1241   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1242   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1243   MPI_Comm      comm;
1244   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1245   PetscMPIInt  *onodes1, *olengths1, idex, end;
1246   Mat_SubSppt  *smatis1;
1247   PetscBool     isrowsorted, iscolsorted;
1248 
1249   PetscFunctionBegin;
1250   PetscValidLogicalCollectiveInt(C, ismax, 2);
1251   PetscValidLogicalCollectiveEnum(C, scall, 5);
1252   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1253   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1254   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1255   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1256   size = c->size;
1257   rank = c->rank;
1258 
1259   PetscCall(ISSorted(iscol[0], &iscolsorted));
1260   PetscCall(ISSorted(isrow[0], &isrowsorted));
1261   PetscCall(ISGetIndices(isrow[0], &irow));
1262   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1263   if (allcolumns) {
1264     icol = NULL;
1265     ncol = C->cmap->N;
1266   } else {
1267     PetscCall(ISGetIndices(iscol[0], &icol));
1268     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1269   }
1270 
1271   if (scall == MAT_INITIAL_MATRIX) {
1272     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1273 
1274     /* Get some new tags to keep the communication clean */
1275     tag1 = ((PetscObject)C)->tag;
1276     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1277     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1278 
1279     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1280      required. Based on this, buffers are allocated and data is copied into them */
1281     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1282     PetscCall(PetscMalloc1(nrow, &row2proc));
1283 
1284     /* w1[proc] = num of rows owned by proc -- to be requested */
1285     proc = 0;
1286     nrqs = 0; /* num of outgoing messages */
1287     for (j = 0; j < nrow; j++) {
1288       row = irow[j];
1289       if (!isrowsorted) proc = 0;
1290       while (row >= C->rmap->range[proc + 1]) proc++;
1291       w1[proc]++;
1292       row2proc[j] = proc; /* map row index to proc */
1293 
1294       if (proc != rank && !w2[proc]) {
1295         w2[proc] = 1;
1296         nrqs++;
1297       }
1298     }
1299     w1[rank] = 0; /* rows owned by self will not be requested */
1300 
1301     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1302     for (proc = 0, j = 0; proc < size; proc++) {
1303       if (w1[proc]) pa[j++] = proc;
1304     }
1305 
1306     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1307     msz = 0; /* total mesg length (for all procs) */
1308     for (i = 0; i < nrqs; i++) {
1309       proc = pa[i];
1310       w1[proc] += 3;
1311       msz += w1[proc];
1312     }
1313     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1314 
1315     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1316     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1317     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1318 
1319     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1320        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1321     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1322 
1323     /* Now post the Irecvs corresponding to these messages */
1324     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1325 
1326     PetscCall(PetscFree(onodes1));
1327     PetscCall(PetscFree(olengths1));
1328 
1329     /* Allocate Memory for outgoing messages */
1330     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1331     PetscCall(PetscArrayzero(sbuf1, size));
1332     PetscCall(PetscArrayzero(ptr, size));
1333 
1334     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1335     iptr = tmp;
1336     for (i = 0; i < nrqs; i++) {
1337       proc        = pa[i];
1338       sbuf1[proc] = iptr;
1339       iptr += w1[proc];
1340     }
1341 
1342     /* Form the outgoing messages */
1343     /* Initialize the header space */
1344     for (i = 0; i < nrqs; i++) {
1345       proc = pa[i];
1346       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1347       ptr[proc] = sbuf1[proc] + 3;
1348     }
1349 
1350     /* Parse the isrow and copy data into outbuf */
1351     PetscCall(PetscArrayzero(ctr, size));
1352     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1353       proc = row2proc[j];
1354       if (proc != rank) { /* copy to the outgoing buf*/
1355         *ptr[proc] = irow[j];
1356         ctr[proc]++;
1357         ptr[proc]++;
1358       }
1359     }
1360 
1361     /* Update the headers for the current IS */
1362     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1363       if ((ctr_j = ctr[j])) {
1364         sbuf1_j            = sbuf1[j];
1365         k                  = ++sbuf1_j[0];
1366         sbuf1_j[2 * k]     = ctr_j;
1367         sbuf1_j[2 * k - 1] = 0;
1368       }
1369     }
1370 
1371     /* Now post the sends */
1372     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1373     for (i = 0; i < nrqs; ++i) {
1374       proc = pa[i];
1375       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1376     }
1377 
1378     /* Post Receives to capture the buffer size */
1379     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1380     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1381 
1382     if (nrqs) rbuf2[0] = tmp + msz;
1383     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
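    /* rbuf2[i] reuses the second half of the tmp allocation (tmp + msz onward); from pa[i] it will
       receive a reply whose slot 0 holds the total number of nonzeros that process will send, followed
       by the echoed request header and the per-row nonzero counts */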
1384 
1385     for (i = 0; i < nrqs; ++i) {
1386       proc = pa[i];
1387       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1388     }
1389 
1390     PetscCall(PetscFree2(w1, w2));
1391 
1392     /* Send to other procs the buf size they should allocate */
1393     /* Receive messages*/
1394     PetscCall(PetscMalloc1(nrqr, &r_status1));
1395     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
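    /* sbuf2[i] echoes the i-th request header (with slot 0 replaced by the total nonzero count) and
       carries back one nonzero count per requested row; req_size[i] accumulates the total number of
       column indices/values to be sent to that requester */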
1396 
1397     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1398     for (i = 0; i < nrqr; ++i) {
1399       req_size[i] = 0;
1400       rbuf1_i     = rbuf1[i];
1401       start       = 2 * rbuf1_i[0] + 1;
1402       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1403       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1404       sbuf2_i = sbuf2[i];
1405       for (j = start; j < end; j++) {
1406         k          = rbuf1_i[j] - rstart;
1407         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1408         sbuf2_i[j] = ncols;
1409         req_size[i] += ncols;
1410       }
1411       req_source1[i] = r_status1[i].MPI_SOURCE;
1412 
1413       /* form the header */
1414       sbuf2_i[0] = req_size[i];
1415       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1416 
1417       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1418     }
1419 
1420     PetscCall(PetscFree(r_status1));
1421     PetscCall(PetscFree(r_waits1));
1422 
1423     /* rbuf2 is received; post receives for the column indices a->j */
1424     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1425 
1426     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1427     for (i = 0; i < nrqs; ++i) {
1428       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1429       req_source2[i] = r_status2[i].MPI_SOURCE;
1430       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1431     }
1432 
1433     /* Wait on sends1 and sends2 */
1434     PetscCall(PetscMalloc1(nrqs, &s_status1));
1435     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1436     PetscCall(PetscFree(s_waits1));
1437     PetscCall(PetscFree(s_status1));
1438 
1439     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1440     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1441 
1442     /* Now allocate sending buffers for a->j, and send them off */
1443     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1444     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1445     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1446     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1447 
1448     for (i = 0; i < nrqr; i++) { /* for each requested message */
1449       rbuf1_i   = rbuf1[i];
1450       sbuf_aj_i = sbuf_aj[i];
1451       ct1       = 2 * rbuf1_i[0] + 1;
1452       ct2       = 0;
1453       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1454 
1455       kmax = rbuf1[i][2];
1456       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1457         row    = rbuf1_i[ct1] - rstart;
1458         nzA    = ai[row + 1] - ai[row];
1459         nzB    = bi[row + 1] - bi[row];
1460         ncols  = nzA + nzB;
1461         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1462         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1463 
1464         /* load the column indices for this row into cols */
1465         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1466 
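        /* pack the row's global column indices in increasing order: B (off-diagonal) columns whose
           global index bmap[.] is below cstart, then the A (diagonal-block) columns converted to
           global indices by adding cstart, then B columns at or beyond cend */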
1467         lwrite = 0;
1468         for (l = 0; l < nzB; l++) {
1469           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1470         }
1471         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1472         for (l = 0; l < nzB; l++) {
1473           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1474         }
1475 
1476         ct2 += ncols;
1477       }
1478       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1479     }
1480 
1481     /* create column map (cmap): global col of C -> local col of submat */
1482 #if defined(PETSC_USE_CTABLE)
1483     if (!allcolumns) {
1484       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1485       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1486       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1487         if (icol[j] >= cstart && icol[j] < cend) {
1488           cmap_loc[icol[j] - cstart] = j + 1;
1489         } else { /* use PetscHMapI for non-local col indices */
1490           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1491         }
1492       }
1493     } else {
1494       cmap     = NULL;
1495       cmap_loc = NULL;
1496     }
1497     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1498 #else
1499     if (!allcolumns) {
1500       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1501       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1502     } else {
1503       cmap = NULL;
1504     }
1505 #endif
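    /* The column maps store (local submat column + 1), so a lookup returning 0 below means
       the global column of C is not part of the submatrix */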
1506 
1507     /* Create lens for MatSeqAIJSetPreallocation() */
1508     PetscCall(PetscCalloc1(nrow, &lens));
1509 
1510     /* Compute lens from local part of C */
1511     for (j = 0; j < nrow; j++) {
1512       row  = irow[j];
1513       proc = row2proc[j];
1514       if (proc == rank) {
1515         /* diagonal part A = c->A */
1516         ncols = ai[row - rstart + 1] - ai[row - rstart];
1517         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1518         if (!allcolumns) {
1519           for (k = 0; k < ncols; k++) {
1520 #if defined(PETSC_USE_CTABLE)
1521             tcol = cmap_loc[cols[k]];
1522 #else
1523             tcol = cmap[cols[k] + cstart];
1524 #endif
1525             if (tcol) lens[j]++;
1526           }
1527         } else { /* allcolumns */
1528           lens[j] = ncols;
1529         }
1530 
1531         /* off-diagonal part B = c->B */
1532         ncols = bi[row - rstart + 1] - bi[row - rstart];
1533         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1534         if (!allcolumns) {
1535           for (k = 0; k < ncols; k++) {
1536 #if defined(PETSC_USE_CTABLE)
1537             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1538 #else
1539             tcol = cmap[bmap[cols[k]]];
1540 #endif
1541             if (tcol) lens[j]++;
1542           }
1543         } else { /* allcolumns */
1544           lens[j] += ncols;
1545         }
1546       }
1547     }
1548 
1549     /* Create row map (rmap): global row of C -> local row of submat */
1550 #if defined(PETSC_USE_CTABLE)
1551     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1552     for (j = 0; j < nrow; j++) {
1553       row  = irow[j];
1554       proc = row2proc[j];
1555       if (proc == rank) { /* a local row */
1556         rmap_loc[row - rstart] = j;
1557       } else {
1558         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1559       }
1560     }
1561 #else
1562     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1563     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1564 #endif
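    /* Conventions: with PETSC_USE_CTABLE, rmap_loc[] stores the submat row index directly for locally
       owned rows while the hash map stores (index + 1) for off-process rows, hence the row-- after each
       hash lookup; without PETSC_USE_CTABLE, the global array rmap[] stores the index directly */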
1565 
1566     /* Update lens from offproc data */
1567     /* recv a->j is done */
1568     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1569     for (i = 0; i < nrqs; i++) {
1570       proc    = pa[i];
1571       sbuf1_i = sbuf1[proc];
1572       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1573       ct1     = 2 + 1;
1574       ct2     = 0;
1575       rbuf2_i = rbuf2[i]; /* received length of C->j */
1576       rbuf3_i = rbuf3[i]; /* received C->j */
1577 
1578       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1579       max1 = sbuf1_i[2];
1580       for (k = 0; k < max1; k++, ct1++) {
1581 #if defined(PETSC_USE_CTABLE)
1582         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1583         row--;
1584         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1585 #else
1586         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1587 #endif
1588         /* Now, store row index of submat in sbuf1_i[ct1] */
1589         sbuf1_i[ct1] = row;
1590 
1591         nnz = rbuf2_i[ct1];
1592         if (!allcolumns) {
1593           for (l = 0; l < nnz; l++, ct2++) {
1594 #if defined(PETSC_USE_CTABLE)
1595             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1596               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1597             } else {
1598               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1599             }
1600 #else
1601             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1602 #endif
1603             if (tcol) lens[row]++;
1604           }
1605         } else { /* allcolumns */
1606           lens[row] += nnz;
1607         }
1608       }
1609     }
1610     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1611     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1612 
1613     /* Create the submatrices */
1614     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1615     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1616 
1617     PetscCall(ISGetBlockSize(isrow[0], &i));
1618     PetscCall(ISGetBlockSize(iscol[0], &j));
1619     if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j));
1620     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1621     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1622 
1623     /* create struct Mat_SubSppt and attach it to submat */
1624     PetscCall(PetscNew(&smatis1));
1625     subc            = (Mat_SeqAIJ *)submat->data;
1626     subc->submatis1 = smatis1;
1627 
1628     smatis1->id          = 0;
1629     smatis1->nrqs        = nrqs;
1630     smatis1->nrqr        = nrqr;
1631     smatis1->rbuf1       = rbuf1;
1632     smatis1->rbuf2       = rbuf2;
1633     smatis1->rbuf3       = rbuf3;
1634     smatis1->sbuf2       = sbuf2;
1635     smatis1->req_source2 = req_source2;
1636 
1637     smatis1->sbuf1 = sbuf1;
1638     smatis1->ptr   = ptr;
1639     smatis1->tmp   = tmp;
1640     smatis1->ctr   = ctr;
1641 
1642     smatis1->pa          = pa;
1643     smatis1->req_size    = req_size;
1644     smatis1->req_source1 = req_source1;
1645 
1646     smatis1->allcolumns = allcolumns;
1647     smatis1->singleis   = PETSC_TRUE;
1648     smatis1->row2proc   = row2proc;
1649     smatis1->rmap       = rmap;
1650     smatis1->cmap       = cmap;
1651 #if defined(PETSC_USE_CTABLE)
1652     smatis1->rmap_loc = rmap_loc;
1653     smatis1->cmap_loc = cmap_loc;
1654 #endif
1655 
1656     smatis1->destroy     = submat->ops->destroy;
1657     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1658     submat->factortype   = C->factortype;
1659 
1660     /* compute rmax */
1661     rmax = 0;
1662     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
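    /* rmax (the longest submat row) sizes the subcols/subvals staging arrays used with
       MatSetValues_SeqAIJ() during assembly below */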
1663 
1664   } else { /* scall == MAT_REUSE_MATRIX */
1665     submat = submats[0];
1666     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");
1667 
1668     subc        = (Mat_SeqAIJ *)submat->data;
1669     rmax        = subc->rmax;
1670     smatis1     = subc->submatis1;
1671     nrqs        = smatis1->nrqs;
1672     nrqr        = smatis1->nrqr;
1673     rbuf1       = smatis1->rbuf1;
1674     rbuf2       = smatis1->rbuf2;
1675     rbuf3       = smatis1->rbuf3;
1676     req_source2 = smatis1->req_source2;
1677 
1678     sbuf1 = smatis1->sbuf1;
1679     sbuf2 = smatis1->sbuf2;
1680     ptr   = smatis1->ptr;
1681     tmp   = smatis1->tmp;
1682     ctr   = smatis1->ctr;
1683 
1684     pa          = smatis1->pa;
1685     req_size    = smatis1->req_size;
1686     req_source1 = smatis1->req_source1;
1687 
1688     allcolumns = smatis1->allcolumns;
1689     row2proc   = smatis1->row2proc;
1690     rmap       = smatis1->rmap;
1691     cmap       = smatis1->cmap;
1692 #if defined(PETSC_USE_CTABLE)
1693     rmap_loc = smatis1->rmap_loc;
1694     cmap_loc = smatis1->cmap_loc;
1695 #endif
1696   }
1697 
1698   /* Post recv matrix values */
1699   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1700   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1701   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1702   for (i = 0; i < nrqs; ++i) {
1703     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1704     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1705   }
1706 
1707   /* Allocate sending buffers for a->a, and send them off */
1708   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1709   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1710   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1711   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1712 
1713   for (i = 0; i < nrqr; i++) {
1714     rbuf1_i   = rbuf1[i];
1715     sbuf_aa_i = sbuf_aa[i];
1716     ct1       = 2 * rbuf1_i[0] + 1;
1717     ct2       = 0;
1718     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1719 
1720     kmax = rbuf1_i[2];
1721     for (k = 0; k < kmax; k++, ct1++) {
1722       row    = rbuf1_i[ct1] - rstart;
1723       nzA    = ai[row + 1] - ai[row];
1724       nzB    = bi[row + 1] - bi[row];
1725       ncols  = nzA + nzB;
1726       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1727       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1728       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1729 
1730       /* load the column values for this row into vals */
1731       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1732 
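      /* values are packed in exactly the same order as the column indices sent with tag3
         (B-part below cstart, then A-part, then B-part at or beyond cend), so they line up
         entry-for-entry on the receiving side */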
1733       lwrite = 0;
1734       for (l = 0; l < nzB; l++) {
1735         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1736       }
1737       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1738       for (l = 0; l < nzB; l++) {
1739         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1740       }
1741 
1742       ct2 += ncols;
1743     }
1744     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1745   }
1746 
1747   /* Assemble submat */
1748   /* First assemble the local rows */
1749   for (j = 0; j < nrow; j++) {
1750     row  = irow[j];
1751     proc = row2proc[j];
1752     if (proc == rank) {
1753       Crow = row - rstart; /* local row index of C */
1754 #if defined(PETSC_USE_CTABLE)
1755       row = rmap_loc[Crow]; /* row index of submat */
1756 #else
1757       row = rmap[row];
1758 #endif
1759 
1760       if (allcolumns) {
1761         /* diagonal part A = c->A */
1762         ncols = ai[Crow + 1] - ai[Crow];
1763         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1764         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1765         i     = 0;
1766         for (k = 0; k < ncols; k++) {
1767           subcols[i]   = cols[k] + cstart;
1768           subvals[i++] = vals[k];
1769         }
1770 
1771         /* off-diagonal part B = c->B */
1772         ncols = bi[Crow + 1] - bi[Crow];
1773         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1774         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1775         for (k = 0; k < ncols; k++) {
1776           subcols[i]   = bmap[cols[k]];
1777           subvals[i++] = vals[k];
1778         }
1779 
1780         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1781 
1782       } else { /* !allcolumns */
1783 #if defined(PETSC_USE_CTABLE)
1784         /* diagonal part A = c->A */
1785         ncols = ai[Crow + 1] - ai[Crow];
1786         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1787         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1788         i     = 0;
1789         for (k = 0; k < ncols; k++) {
1790           tcol = cmap_loc[cols[k]];
1791           if (tcol) {
1792             subcols[i]   = --tcol;
1793             subvals[i++] = vals[k];
1794           }
1795         }
1796 
1797         /* off-diagonal part B = c->B */
1798         ncols = bi[Crow + 1] - bi[Crow];
1799         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1800         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1801         for (k = 0; k < ncols; k++) {
1802           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1803           if (tcol) {
1804             subcols[i]   = --tcol;
1805             subvals[i++] = vals[k];
1806           }
1807         }
1808 #else
1809         /* diagonal part A = c->A */
1810         ncols = ai[Crow + 1] - ai[Crow];
1811         cols  = aj + ai[Crow];
1812         vals  = a_a + ai[Crow];
1813         i     = 0;
1814         for (k = 0; k < ncols; k++) {
1815           tcol = cmap[cols[k] + cstart];
1816           if (tcol) {
1817             subcols[i]   = --tcol;
1818             subvals[i++] = vals[k];
1819           }
1820         }
1821 
1822         /* off-diagonal part B = c->B */
1823         ncols = bi[Crow + 1] - bi[Crow];
1824         cols  = bj + bi[Crow];
1825         vals  = b_a + bi[Crow];
1826         for (k = 0; k < ncols; k++) {
1827           tcol = cmap[bmap[cols[k]]];
1828           if (tcol) {
1829             subcols[i]   = --tcol;
1830             subvals[i++] = vals[k];
1831           }
1832         }
1833 #endif
1834         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1835       }
1836     }
1837   }
1838 
1839   /* Now assemble the off-proc rows */
1840   for (i = 0; i < nrqs; i++) { /* for each requested message */
1841     /* recv values from other processes */
1842     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1843     proc    = pa[idex];
1844     sbuf1_i = sbuf1[proc];
1845     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1846     ct1     = 2 + 1;
1847     ct2     = 0;           /* count of received C->j */
1848     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1849     rbuf2_i = rbuf2[idex]; /* received lengths of C->j rows from other processes */
1850     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1851     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1852 
1853     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1854     max1 = sbuf1_i[2];                  /* num of rows */
1855     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1856       row = sbuf1_i[ct1];               /* row index of submat */
1857       if (!allcolumns) {
1858         idex = 0;
1859         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1861           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1862 #if defined(PETSC_USE_CTABLE)
1863             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865             } else {
1866               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1867             }
1868 #else
1869             tcol = cmap[rbuf3_i[ct2]];
1870 #endif
1871             if (tcol) {
1872               subcols[idex]   = --tcol; /* may not be sorted */
1873               subvals[idex++] = rbuf4_i[ct2];
1874 
1875               /* We receive all column indices of a row of C, but only a subset of them is inserted into submat.
1876                For reuse, we overwrite the received C->j with the indices into rbuf4 that must be inserted into submat */
1877               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878             }
1879           }
1880           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1881         } else { /* scall == MAT_REUSE_MATRIX */
1882           submat = submats[0];
1883           subc   = (Mat_SeqAIJ *)submat->data;
1884 
1885           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886           for (l = 0; l < nnz; l++) {
1887             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888             subvals[idex++] = rbuf4_i[ct2];
1889           }
1890 
1891           bj = subc->j + subc->i[row]; /* sorted column indices */
1892           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1893         }
1894       } else {              /* allcolumns */
1895         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1897         ct2 += nnz;
1898       }
1899     }
1900   }
1901 
1902   /* sends of a->a are done */
1903   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1904   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1905 
1906   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1907   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1908   submats[0] = submat;
1909 
1910   /* Restore the indices */
1911   PetscCall(ISRestoreIndices(isrow[0], &irow));
1912   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1913 
1914   /* Destroy allocated memory */
1915   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1916   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1917   if (sbuf_aa) {
1918     PetscCall(PetscFree(sbuf_aa[0]));
1919     PetscCall(PetscFree(sbuf_aa));
1920   }
1921 
1922   if (scall == MAT_INITIAL_MATRIX) {
1923     PetscCall(PetscFree(lens));
1924     if (sbuf_aj) {
1925       PetscCall(PetscFree(sbuf_aj[0]));
1926       PetscCall(PetscFree(sbuf_aj));
1927     }
1928   }
1929   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1930   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1931   PetscFunctionReturn(PETSC_SUCCESS);
1932 }
1933 
1934 static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935 {
1936   PetscInt  ncol;
1937   PetscBool colflag, allcolumns = PETSC_FALSE;
1938 
1939   PetscFunctionBegin;
1940   /* Allocate memory to hold all the submatrices */
1941   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1942 
1943   /* Check for special case: each processor gets entire matrix columns */
1944   PetscCall(ISIdentity(iscol[0], &colflag));
1945   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1946   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
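  /* With allcolumns the submatrix keeps C's full column space, so no global-to-local column map
     needs to be built */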
1947 
1948   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1949   PetscFunctionReturn(PETSC_SUCCESS);
1950 }
1951 
1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1953 {
1954   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1955   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1956   Mat_SeqAIJ  *subc;
1957   Mat_SubSppt *smat;
1958 
1959   PetscFunctionBegin;
1960   /* Check for special case: each processor has a single IS */
1961   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1962     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1963     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1964     PetscFunctionReturn(PETSC_SUCCESS);
1965   }
1966 
1967   /* Collect global wantallmatrix and nstages */
1968   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1969   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1970   if (!nmax) nmax = 1;
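  /* nmax caps the number of submatrices extracted per stage; the heuristic keeps the per-stage
     work arrays (roughly one C->cmap->N-long integer map per submatrix) within about 20 MB */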
1971 
1972   if (scall == MAT_INITIAL_MATRIX) {
1973     /* Collect global wantallmatrix and nstages */
1974     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1975       PetscCall(ISIdentity(*isrow, &rowflag));
1976       PetscCall(ISIdentity(*iscol, &colflag));
1977       PetscCall(ISGetLocalSize(*isrow, &nrow));
1978       PetscCall(ISGetLocalSize(*iscol, &ncol));
1979       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1980         wantallmatrix = PETSC_TRUE;
1981 
1982         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1983       }
1984     }
1985 
1986     /* Determine the number of stages through which submatrices are done
1987        Each stage will extract nmax submatrices.
1988        nmax is determined by the matrix column dimension.
1989        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1990     */
1991     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1992 
1993     in[0] = -1 * (PetscInt)wantallmatrix;
1994     in[1] = nstages;
1995     PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1996     wantallmatrix = (PetscBool)(-out[0]);
1997     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
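    /* Encoding wantallmatrix as -1/0 makes the MPI_MAX reduction act as a logical AND:
       the fast path below is taken only if every process requested the entire matrix */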
1998 
1999   } else { /* MAT_REUSE_MATRIX */
2000     if (ismax) {
2001       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2002       smat = subc->submatis1;
2003     } else { /* (*submat)[0] is a dummy matrix */
2004       smat = (Mat_SubSppt *)(*submat)[0]->data;
2005     }
2006     if (!smat) {
2007       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2008       wantallmatrix = PETSC_TRUE;
2009     } else if (smat->singleis) {
2010       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2011       PetscFunctionReturn(PETSC_SUCCESS);
2012     } else {
2013       nstages = smat->nstages;
2014     }
2015   }
2016 
2017   if (wantallmatrix) {
2018     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2019     PetscFunctionReturn(PETSC_SUCCESS);
2020   }
2021 
2022   /* Allocate memory to hold all the submatrices and dummy submatrices */
2023   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2024 
2025   for (i = 0, pos = 0; i < nstages; i++) {
2026     if (pos + nmax <= ismax) max_no = nmax;
2027     else if (pos >= ismax) max_no = 0;
2028     else max_no = ismax - pos;
2029 
2030     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2031     if (!max_no) {
2032       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2033         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2034         smat->nstages = nstages;
2035       }
2036       pos++; /* advance to next dummy matrix if any */
2037     } else pos += max_no;
2038   }
2039 
2040   if (ismax && scall == MAT_INITIAL_MATRIX) {
2041     /* save nstages for reuse */
2042     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2043     smat          = subc->submatis1;
2044     smat->nstages = nstages;
2045   }
2046   PetscFunctionReturn(PETSC_SUCCESS);
2047 }
2048 
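/*
   A minimal usage sketch (hypothetical caller, shown only for orientation): the routines below are
   reached through the public MatCreateSubMatrices() interface, which dispatches here for MPIAIJ
   matrices. A typical call sequence is

     IS  isrow, iscol;    // index sets describing the wanted rows/columns, built by the caller
     Mat *submats;
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats));
     // ... update the numerical values of C, keeping the same nonzero pattern ...
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_REUSE_MATRIX, &submats));
     PetscCall(MatDestroySubMatrices(1, &submats));
*/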
2049 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2050 {
2051   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2052   Mat              A = c->A;
2053   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2054   const PetscInt **icol, **irow;
2055   PetscInt        *nrow, *ncol, start;
2056   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2057   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2058   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2059   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2060   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2061 #if defined(PETSC_USE_CTABLE)
2062   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2063 #else
2064   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2065 #endif
2066   const PetscInt *irow_i;
2067   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2068   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2069   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2070   MPI_Comm        comm;
2071   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2072   PetscMPIInt    *onodes1, *olengths1, end;
2073   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2074   Mat_SubSppt    *smat_i;
2075   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2076   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2077 
2078   PetscFunctionBegin;
2079   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2080   size = c->size;
2081   rank = c->rank;
2082 
2083   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2084   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2085 
2086   for (i = 0; i < ismax; i++) {
2087     PetscCall(ISSorted(iscol[i], &issorted[i]));
2088     if (!issorted[i]) iscsorted = issorted[i];
2089 
2090     PetscCall(ISSorted(isrow[i], &issorted[i]));
2091 
2092     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2093     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2094 
2095     /* Check for special case: allcolumn */
2096     PetscCall(ISIdentity(iscol[i], &colflag));
2097     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2098     if (colflag && ncol[i] == C->cmap->N) {
2099       allcolumns[i] = PETSC_TRUE;
2100       icol[i]       = NULL;
2101     } else {
2102       allcolumns[i] = PETSC_FALSE;
2103       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2104     }
2105   }
2106 
2107   if (scall == MAT_REUSE_MATRIX) {
2108     /* Assumes the new rows have the same lengths as the old rows */
2109     for (i = 0; i < ismax; i++) {
2110       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2111       subc = (Mat_SeqAIJ *)submats[i]->data;
2112       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");
2113 
2114       /* Initial matrix as if empty */
2115       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2116 
2117       smat_i = subc->submatis1;
2118 
2119       nrqs        = smat_i->nrqs;
2120       nrqr        = smat_i->nrqr;
2121       rbuf1       = smat_i->rbuf1;
2122       rbuf2       = smat_i->rbuf2;
2123       rbuf3       = smat_i->rbuf3;
2124       req_source2 = smat_i->req_source2;
2125 
2126       sbuf1 = smat_i->sbuf1;
2127       sbuf2 = smat_i->sbuf2;
2128       ptr   = smat_i->ptr;
2129       tmp   = smat_i->tmp;
2130       ctr   = smat_i->ctr;
2131 
2132       pa          = smat_i->pa;
2133       req_size    = smat_i->req_size;
2134       req_source1 = smat_i->req_source1;
2135 
2136       allcolumns[i] = smat_i->allcolumns;
2137       row2proc[i]   = smat_i->row2proc;
2138       rmap[i]       = smat_i->rmap;
2139       cmap[i]       = smat_i->cmap;
2140     }
2141 
2142     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2143       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2144       smat_i = (Mat_SubSppt *)submats[0]->data;
2145 
2146       nrqs        = smat_i->nrqs;
2147       nrqr        = smat_i->nrqr;
2148       rbuf1       = smat_i->rbuf1;
2149       rbuf2       = smat_i->rbuf2;
2150       rbuf3       = smat_i->rbuf3;
2151       req_source2 = smat_i->req_source2;
2152 
2153       sbuf1 = smat_i->sbuf1;
2154       sbuf2 = smat_i->sbuf2;
2155       ptr   = smat_i->ptr;
2156       tmp   = smat_i->tmp;
2157       ctr   = smat_i->ctr;
2158 
2159       pa          = smat_i->pa;
2160       req_size    = smat_i->req_size;
2161       req_source1 = smat_i->req_source1;
2162 
2163       allcolumns[0] = PETSC_FALSE;
2164     }
2165   } else { /* scall == MAT_INITIAL_MATRIX */
2166     /* Get some new tags to keep the communication clean */
2167     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2168     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2169 
2170     /* evaluate communication - message to whom, message length, and buffer space
2171      required. Based on this, buffers are allocated and data copied into them */
2172     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2173 
2174     for (i = 0; i < ismax; i++) {
2175       jmax   = nrow[i];
2176       irow_i = irow[i];
2177 
2178       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2179       row2proc[i] = row2proc_i;
2180 
2181       if (issorted[i]) proc = 0;
2182       for (j = 0; j < jmax; j++) {
2183         if (!issorted[i]) proc = 0;
2184         row = irow_i[j];
2185         while (row >= C->rmap->range[proc + 1]) proc++;
2186         w4[proc]++;
2187         row2proc_i[j] = proc; /* map row index to proc */
2188       }
2189       for (j = 0; j < size; j++) {
2190         if (w4[j]) {
2191           w1[j] += w4[j];
2192           w3[j]++;
2193           w4[j] = 0;
2194         }
2195       }
2196     }
2197 
2198     nrqs     = 0; /* no of outgoing messages */
2199     msz      = 0; /* total mesg length (for all procs) */
2200     w1[rank] = 0; /* no mesg sent to self */
2201     w3[rank] = 0;
2202     for (i = 0; i < size; i++) {
2203       if (w1[i]) {
2204         w2[i] = 1;
2205         nrqs++;
2206       } /* there exists a message to proc i */
2207     }
2208     PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc array) */
2209     for (i = 0, j = 0; i < size; i++) {
2210       if (w1[i]) {
2211         pa[j] = i;
2212         j++;
2213       }
2214     }
2215 
2216     /* Each message would have a header = 1 + 2*(no of IS) + data */
2217     for (i = 0; i < nrqs; i++) {
2218       j = pa[i];
2219       w1[j] += w2[j] + 2 * w3[j];
2220       msz += w1[j];
2221     }
2222     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2223 
2224     /* Determine the number of messages to expect, their lengths, and the sending ranks (from-ids) */
2225     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2226     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2227 
2228     /* Now post the Irecvs corresponding to these messages */
2229     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2230     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2231 
2232     /* Allocate Memory for outgoing messages */
2233     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2234     PetscCall(PetscArrayzero(sbuf1, size));
2235     PetscCall(PetscArrayzero(ptr, size));
2236 
2237     {
2238       PetscInt *iptr = tmp;
2239       k              = 0;
2240       for (i = 0; i < nrqs; i++) {
2241         j = pa[i];
2242         iptr += k;
2243         sbuf1[j] = iptr;
2244         k        = w1[j];
2245       }
2246     }
2247 
2248     /* Form the outgoing messages. Initialize the header space */
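    /* Layout of each sbuf1[j]: slot 0 = number of index sets that request rows owned by process j;
       for the k-th such set, slot 2k-1 = its IS number and slot 2k = its row count; the requested
       global row indices follow the reserved header of 1 + 2*w3[j] entries */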
2249     for (i = 0; i < nrqs; i++) {
2250       j           = pa[i];
2251       sbuf1[j][0] = 0;
2252       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2253       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2254     }
2255 
2256     /* Parse the isrow and copy data into outbuf */
2257     for (i = 0; i < ismax; i++) {
2258       row2proc_i = row2proc[i];
2259       PetscCall(PetscArrayzero(ctr, size));
2260       irow_i = irow[i];
2261       jmax   = nrow[i];
2262       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2263         proc = row2proc_i[j];
2264         if (proc != rank) { /* copy to the outgoing buffer */
2265           ctr[proc]++;
2266           *ptr[proc] = irow_i[j];
2267           ptr[proc]++;
2268         }
2269       }
2270       /* Update the headers for the current IS */
2271       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2272         if ((ctr_j = ctr[j])) {
2273           sbuf1_j            = sbuf1[j];
2274           k                  = ++sbuf1_j[0];
2275           sbuf1_j[2 * k]     = ctr_j;
2276           sbuf1_j[2 * k - 1] = i;
2277         }
2278       }
2279     }
2280 
2281     /* Now post the sends */
2282     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2283     for (i = 0; i < nrqs; ++i) {
2284       j = pa[i];
2285       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2286     }
2287 
2288     /* Post Receives to capture the buffer size */
2289     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2290     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2291     if (nrqs) rbuf2[0] = tmp + msz;
2292     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
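    /* rbuf2[i] reuses tmp + msz onward; its slot 0 will hold the total number of nonzeros process
       pa[i] will send back, followed by the echoed request header and the per-row nonzero counts */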
2293     for (i = 0; i < nrqs; ++i) {
2294       j = pa[i];
2295       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2296     }
2297 
2298     /* Send to other procs the buf size they should allocate */
2299     /* Receive messages*/
2300     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2301     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2302     {
2303       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2304       PetscInt *sbuf2_i;
2305 
2306       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2307       for (i = 0; i < nrqr; ++i) {
2308         req_size[i] = 0;
2309         rbuf1_i     = rbuf1[i];
2310         start       = 2 * rbuf1_i[0] + 1;
2311         end         = olengths1[i];
2312         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2313         sbuf2_i = sbuf2[i];
2314         for (j = start; j < end; j++) {
2315           id         = rbuf1_i[j] - rstart;
2316           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2317           sbuf2_i[j] = ncols;
2318           req_size[i] += ncols;
2319         }
2320         req_source1[i] = onodes1[i];
2321         /* form the header */
2322         sbuf2_i[0] = req_size[i];
2323         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2324 
2325         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2326       }
2327     }
2328 
2329     PetscCall(PetscFree(onodes1));
2330     PetscCall(PetscFree(olengths1));
2331     PetscCall(PetscFree(r_waits1));
2332     PetscCall(PetscFree4(w1, w2, w3, w4));
2333 
2334     /* Receive messages*/
2335     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2336     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2337     for (i = 0; i < nrqs; ++i) {
2338       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2339       req_source2[i] = pa[i];
2340       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2341     }
2342     PetscCall(PetscFree(r_waits2));
2343 
2344     /* Wait on sends1 and sends2 */
2345     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2346     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2347     PetscCall(PetscFree(s_waits1));
2348     PetscCall(PetscFree(s_waits2));
2349 
2350     /* Now allocate sending buffers for a->j, and send them off */
2351     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2352     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2353     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2354     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2355 
2356     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2357     {
2358       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2359       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2360       PetscInt  cend = C->cmap->rend;
2361       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2362 
2363       for (i = 0; i < nrqr; i++) {
2364         rbuf1_i   = rbuf1[i];
2365         sbuf_aj_i = sbuf_aj[i];
2366         ct1       = 2 * rbuf1_i[0] + 1;
2367         ct2       = 0;
2368         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2369           kmax = rbuf1[i][2 * j];
2370           for (k = 0; k < kmax; k++, ct1++) {
2371             row    = rbuf1_i[ct1] - rstart;
2372             nzA    = a_i[row + 1] - a_i[row];
2373             nzB    = b_i[row + 1] - b_i[row];
2374             ncols  = nzA + nzB;
2375             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2376             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2377 
2378             /* load the column indices for this row into cols */
2379             cols = sbuf_aj_i + ct2;
2380 
2381             lwrite = 0;
2382             for (l = 0; l < nzB; l++) {
2383               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2384             }
2385             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2386             for (l = 0; l < nzB; l++) {
2387               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2388             }
2389 
2390             ct2 += ncols;
2391           }
2392         }
2393         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2394       }
2395     }
2396 
2397     /* create col map: global col of C -> local col of submatrices */
2398     {
2399       const PetscInt *icol_i;
2400 #if defined(PETSC_USE_CTABLE)
2401       for (i = 0; i < ismax; i++) {
2402         if (!allcolumns[i]) {
2403           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2404 
2405           jmax   = ncol[i];
2406           icol_i = icol[i];
2407           cmap_i = cmap[i];
2408           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2409         } else cmap[i] = NULL;
2410       }
2411 #else
2412       for (i = 0; i < ismax; i++) {
2413         if (!allcolumns[i]) {
2414           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2415           jmax   = ncol[i];
2416           icol_i = icol[i];
2417           cmap_i = cmap[i];
2418           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2419         } else cmap[i] = NULL;
2420       }
2421 #endif
2422     }
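    /* Both variants store (local submat column + 1); a lookup returning 0 later means the global
       column is not selected by iscol[i] */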
2423 
2424     /* Create lens, the per-row nonzero counts required for MatSeqAIJSetPreallocation() */
2425     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2426     PetscCall(PetscMalloc1(ismax, &lens));
2427 
2428     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2429     for (i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
2430 
2431     /* Update lens from local data */
2432     for (i = 0; i < ismax; i++) {
2433       row2proc_i = row2proc[i];
2434       jmax       = nrow[i];
2435       if (!allcolumns[i]) cmap_i = cmap[i];
2436       irow_i = irow[i];
2437       lens_i = lens[i];
2438       for (j = 0; j < jmax; j++) {
2439         row  = irow_i[j];
2440         proc = row2proc_i[j];
2441         if (proc == rank) {
2442           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2443           if (!allcolumns[i]) {
2444             for (k = 0; k < ncols; k++) {
2445 #if defined(PETSC_USE_CTABLE)
2446               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2447 #else
2448               tcol = cmap_i[cols[k]];
2449 #endif
2450               if (tcol) lens_i[j]++;
2451             }
2452           } else { /* allcolumns */
2453             lens_i[j] = ncols;
2454           }
2455           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2456         }
2457       }
2458     }
2459 
2460     /* Create row map: global row of C -> local row of submatrices */
2461 #if defined(PETSC_USE_CTABLE)
2462     for (i = 0; i < ismax; i++) {
2463       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2464       irow_i = irow[i];
2465       jmax   = nrow[i];
2466       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2467     }
2468 #else
2469     for (i = 0; i < ismax; i++) {
2470       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2471       rmap_i = rmap[i];
2472       irow_i = irow[i];
2473       jmax   = nrow[i];
2474       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2475     }
2476 #endif
2477 
2478     /* Update lens from offproc data */
2479     {
2480       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2481 
2482       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2483       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2484         sbuf1_i = sbuf1[pa[tmp2]];
2485         jmax    = sbuf1_i[0];
2486         ct1     = 2 * jmax + 1;
2487         ct2     = 0;
2488         rbuf2_i = rbuf2[tmp2];
2489         rbuf3_i = rbuf3[tmp2];
2490         for (j = 1; j <= jmax; j++) {
2491           is_no  = sbuf1_i[2 * j - 1];
2492           max1   = sbuf1_i[2 * j];
2493           lens_i = lens[is_no];
2494           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2495           rmap_i = rmap[is_no];
2496           for (k = 0; k < max1; k++, ct1++) {
2497 #if defined(PETSC_USE_CTABLE)
2498             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2499             row--;
2500             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2501 #else
2502             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2503 #endif
2504             max2 = rbuf2_i[ct1];
2505             for (l = 0; l < max2; l++, ct2++) {
2506               if (!allcolumns[is_no]) {
2507 #if defined(PETSC_USE_CTABLE)
2508                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2509 #else
2510                 tcol = cmap_i[rbuf3_i[ct2]];
2511 #endif
2512                 if (tcol) lens_i[row]++;
2513               } else {         /* allcolumns */
2514                 lens_i[row]++; /* lens_i[row] += max2 ? */
2515               }
2516             }
2517           }
2518         }
2519       }
2520     }
2521     PetscCall(PetscFree(r_waits3));
2522     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2523     PetscCall(PetscFree(s_waits3));
2524 
2525     /* Create the submatrices */
2526     for (i = 0; i < ismax; i++) {
2527       PetscInt rbs, cbs;
2528 
2529       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2530       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2531 
2532       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2533       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2534 
2535       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2536       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2537       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2538 
2539       /* create struct Mat_SubSppt and attach it to submat */
2540       PetscCall(PetscNew(&smat_i));
2541       subc            = (Mat_SeqAIJ *)submats[i]->data;
2542       subc->submatis1 = smat_i;
2543 
2544       smat_i->destroy          = submats[i]->ops->destroy;
2545       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2546       submats[i]->factortype   = C->factortype;
2547 
2548       smat_i->id          = i;
2549       smat_i->nrqs        = nrqs;
2550       smat_i->nrqr        = nrqr;
2551       smat_i->rbuf1       = rbuf1;
2552       smat_i->rbuf2       = rbuf2;
2553       smat_i->rbuf3       = rbuf3;
2554       smat_i->sbuf2       = sbuf2;
2555       smat_i->req_source2 = req_source2;
2556 
2557       smat_i->sbuf1 = sbuf1;
2558       smat_i->ptr   = ptr;
2559       smat_i->tmp   = tmp;
2560       smat_i->ctr   = ctr;
2561 
2562       smat_i->pa          = pa;
2563       smat_i->req_size    = req_size;
2564       smat_i->req_source1 = req_source1;
2565 
2566       smat_i->allcolumns = allcolumns[i];
2567       smat_i->singleis   = PETSC_FALSE;
2568       smat_i->row2proc   = row2proc[i];
2569       smat_i->rmap       = rmap[i];
2570       smat_i->cmap       = cmap[i];
2571     }
2572 
2573     if (!ismax) { /* Create a dummy submats[0] to carry struct Mat_SubSppt for reuse */
2574       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2575       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2576       PetscCall(MatSetType(submats[0], MATDUMMY));
2577 
2578       /* create struct Mat_SubSppt and attach it to submat */
2579       PetscCall(PetscNew(&smat_i));
2580       submats[0]->data = (void *)smat_i;
2581 
2582       smat_i->destroy          = submats[0]->ops->destroy;
2583       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2584       submats[0]->factortype   = C->factortype;
2585 
2586       smat_i->id          = 0;
2587       smat_i->nrqs        = nrqs;
2588       smat_i->nrqr        = nrqr;
2589       smat_i->rbuf1       = rbuf1;
2590       smat_i->rbuf2       = rbuf2;
2591       smat_i->rbuf3       = rbuf3;
2592       smat_i->sbuf2       = sbuf2;
2593       smat_i->req_source2 = req_source2;
2594 
2595       smat_i->sbuf1 = sbuf1;
2596       smat_i->ptr   = ptr;
2597       smat_i->tmp   = tmp;
2598       smat_i->ctr   = ctr;
2599 
2600       smat_i->pa          = pa;
2601       smat_i->req_size    = req_size;
2602       smat_i->req_source1 = req_source1;
2603 
2604       smat_i->allcolumns = PETSC_FALSE;
2605       smat_i->singleis   = PETSC_FALSE;
2606       smat_i->row2proc   = NULL;
2607       smat_i->rmap       = NULL;
2608       smat_i->cmap       = NULL;
2609     }
2610 
2611     if (ismax) PetscCall(PetscFree(lens[0]));
2612     PetscCall(PetscFree(lens));
2613     if (sbuf_aj) {
2614       PetscCall(PetscFree(sbuf_aj[0]));
2615       PetscCall(PetscFree(sbuf_aj));
2616     }
2617 
2618   } /* end of scall == MAT_INITIAL_MATRIX */
2619 
2620   /* Post recv matrix values */
2621   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2622   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2623   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2624   for (i = 0; i < nrqs; ++i) {
2625     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2626     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2627   }
2628 
2629   /* Allocate sending buffers for a->a, and send them off */
2630   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2631   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2632   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2633   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2634 
2635   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2636   {
2637     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2638     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2639     PetscInt     cend = C->cmap->rend;
2640     PetscInt    *b_j  = b->j;
2641     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2642 
2643     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2644     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2645     for (i = 0; i < nrqr; i++) {
2646       rbuf1_i   = rbuf1[i];
2647       sbuf_aa_i = sbuf_aa[i];
2648       ct1       = 2 * rbuf1_i[0] + 1;
2649       ct2       = 0;
2650       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2651         kmax = rbuf1_i[2 * j];
2652         for (k = 0; k < kmax; k++, ct1++) {
2653           row    = rbuf1_i[ct1] - rstart;
2654           nzA    = a_i[row + 1] - a_i[row];
2655           nzB    = b_i[row + 1] - b_i[row];
2656           ncols  = nzA + nzB;
2657           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2658           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2659           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2660 
2661           /* load the column values for this row into vals */
2662           vals = sbuf_aa_i + ct2;
2663 
2664           lwrite = 0;
2665           for (l = 0; l < nzB; l++) {
2666             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2667           }
2668           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2669           for (l = 0; l < nzB; l++) {
2670             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2671           }
2672 
2673           ct2 += ncols;
2674         }
2675       }
2676       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2677     }
2678     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2679     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2680   }
2681 
2682   /* Assemble the matrices */
2683   /* First assemble the local rows */
2684   for (i = 0; i < ismax; i++) {
2685     row2proc_i = row2proc[i];
2686     subc       = (Mat_SeqAIJ *)submats[i]->data;
2687     imat_ilen  = subc->ilen;
2688     imat_j     = subc->j;
2689     imat_i     = subc->i;
2690     imat_a     = subc->a;
2691 
2692     if (!allcolumns[i]) cmap_i = cmap[i];
2693     rmap_i = rmap[i];
2694     irow_i = irow[i];
2695     jmax   = nrow[i];
2696     for (j = 0; j < jmax; j++) {
2697       row  = irow_i[j];
2698       proc = row2proc_i[j];
2699       if (proc == rank) {
2700         old_row = row;
2701 #if defined(PETSC_USE_CTABLE)
2702         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2703         row--;
2704 #else
2705         row = rmap_i[row];
2706 #endif
2707         ilen_row = imat_ilen[row];
2708         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2709         mat_i = imat_i[row];
2710         mat_a = imat_a + mat_i;
2711         mat_j = imat_j + mat_i;
2712         if (!allcolumns[i]) {
2713           for (k = 0; k < ncols; k++) {
2714 #if defined(PETSC_USE_CTABLE)
2715             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2716 #else
2717             tcol = cmap_i[cols[k]];
2718 #endif
2719             if (tcol) {
2720               *mat_j++ = tcol - 1;
2721               *mat_a++ = vals[k];
2722               ilen_row++;
2723             }
2724           }
2725         } else { /* allcolumns */
2726           for (k = 0; k < ncols; k++) {
2727             *mat_j++ = cols[k]; /* global col index! */
2728             *mat_a++ = vals[k];
2729             ilen_row++;
2730           }
2731         }
2732         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2733 
2734         imat_ilen[row] = ilen_row;
2735       }
2736     }
2737   }
2738 
2739   /* Now assemble the off-proc rows */
2740   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2741   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2742     sbuf1_i = sbuf1[pa[tmp2]];
2743     jmax    = sbuf1_i[0];
2744     ct1     = 2 * jmax + 1;
2745     ct2     = 0;
2746     rbuf2_i = rbuf2[tmp2];
2747     rbuf3_i = rbuf3[tmp2];
2748     rbuf4_i = rbuf4[tmp2];
2749     for (j = 1; j <= jmax; j++) {
2750       is_no  = sbuf1_i[2 * j - 1];
2751       rmap_i = rmap[is_no];
2752       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2753       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2754       imat_ilen = subc->ilen;
2755       imat_j    = subc->j;
2756       imat_i    = subc->i;
2757       imat_a    = subc->a;
2758       max1      = sbuf1_i[2 * j];
2759       for (k = 0; k < max1; k++, ct1++) {
2760         row = sbuf1_i[ct1];
2761 #if defined(PETSC_USE_CTABLE)
2762         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2763         row--;
2764 #else
2765         row = rmap_i[row];
2766 #endif
2767         ilen  = imat_ilen[row];
2768         mat_i = imat_i[row];
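        /* PetscSafePointerPlusOffset() avoids forming NULL + offset (undefined behavior) when the
           submatrix has no stored entries and imat_a/imat_j are NULL */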
2769         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2770         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2771         max2  = rbuf2_i[ct1];
2772         if (!allcolumns[is_no]) {
2773           for (l = 0; l < max2; l++, ct2++) {
2774 #if defined(PETSC_USE_CTABLE)
2775             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2776 #else
2777             tcol = cmap_i[rbuf3_i[ct2]];
2778 #endif
2779             if (tcol) {
2780               *mat_j++ = tcol - 1;
2781               *mat_a++ = rbuf4_i[ct2];
2782               ilen++;
2783             }
2784           }
2785         } else { /* allcolumns */
2786           for (l = 0; l < max2; l++, ct2++) {
2787             *mat_j++ = rbuf3_i[ct2]; /* use the global column index of C directly */
2788             *mat_a++ = rbuf4_i[ct2];
2789             ilen++;
2790           }
2791         }
2792         imat_ilen[row] = ilen;
2793       }
2794     }
2795   }
2796 
2797   if (!iscsorted) { /* sort column indices of the rows */
2798     for (i = 0; i < ismax; i++) {
2799       subc      = (Mat_SeqAIJ *)submats[i]->data;
2800       imat_j    = subc->j;
2801       imat_i    = subc->i;
2802       imat_a    = subc->a;
2803       imat_ilen = subc->ilen;
2804 
2805       if (allcolumns[i]) continue;
2806       jmax = nrow[i];
2807       for (j = 0; j < jmax; j++) {
2808         mat_i = imat_i[j];
2809         mat_a = imat_a + mat_i;
2810         mat_j = imat_j + mat_i;
2811         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2812       }
2813     }
2814   }
2815 
2816   PetscCall(PetscFree(r_waits4));
2817   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2818   PetscCall(PetscFree(s_waits4));
2819 
2820   /* Restore the indices */
2821   for (i = 0; i < ismax; i++) {
2822     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2823     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2824   }
2825 
2826   for (i = 0; i < ismax; i++) {
2827     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2828     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2829   }
2830 
2831   /* Destroy allocated memory */
2832   if (sbuf_aa) {
2833     PetscCall(PetscFree(sbuf_aa[0]));
2834     PetscCall(PetscFree(sbuf_aa));
2835   }
2836   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2837 
2838   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2839   PetscCall(PetscFree(rbuf4));
2840 
2841   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2842   PetscFunctionReturn(PETSC_SUCCESS);
2843 }
2844 
2845 /*
2846  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2847  The embeddings are assumed to be injections, which implies that the range of rowemb is a subset
2848  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2849  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2850  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2851  state, and needs to be "assembled" later by compressing B's column space.
2852 
2853  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2854  Following this call, C->A & C->B have been created, even if empty.
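
 A minimal usage sketch (illustrative only; the matrix sizes, the embedding ISs, and the final
 assembly step are assumptions based on the description above, not taken from any particular caller):

   Mat C, A, B;                  // C: MPIAIJ with local size m x n and global column size N
   IS  rowemb, dcolemb, ocolemb; // injective ISs into [0,m), [0,n), [0,N-n); any may be NULL (identity)
   // A: SeqAIJ diag part, B: SeqAIJ off-diag part of the data being inserted into C
   PetscCall(MatSetSeqMats_MPIAIJ(C, rowemb, dcolemb, ocolemb, DIFFERENT_NONZERO_PATTERN, A, B));
   // C is left unassembled; assembling it lets aij->B be compressed into local form
   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));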
2855  */
2856 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2857 {
2858   /* If making this function public, change the error returned in this function away from _PLIB. */
2859   Mat_MPIAIJ     *aij;
2860   Mat_SeqAIJ     *Baij;
2861   PetscBool       seqaij, Bdisassembled;
2862   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2863   PetscScalar     v;
2864   const PetscInt *rowindices, *colindices;
2865 
2866   PetscFunctionBegin;
2867   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2868   if (A) {
2869     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2870     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2871     if (rowemb) {
2872       PetscCall(ISGetLocalSize(rowemb, &m));
2873       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2874     } else {
2875       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2876     }
2877     if (dcolemb) {
2878       PetscCall(ISGetLocalSize(dcolemb, &n));
2879       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2880     } else {
2881       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2882     }
2883   }
2884   if (B) {
2885     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2886     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2887     if (rowemb) {
2888       PetscCall(ISGetLocalSize(rowemb, &m));
2889       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2890     } else {
2891       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2892     }
2893     if (ocolemb) {
2894       PetscCall(ISGetLocalSize(ocolemb, &n));
2895       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2896     } else {
2897       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2898     }
2899   }
2900 
2901   aij = (Mat_MPIAIJ *)C->data;
2902   if (!aij->A) {
2903     /* Mimic parts of MatMPIAIJSetPreallocation() */
2904     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2905     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2906     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2907     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2908   }
2909   if (A) {
2910     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2911   } else {
2912     PetscCall(MatSetUp(aij->A));
2913   }
2914   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2915     /*
2916       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2917       need to "disassemble" B -- convert it to using C's global indices.
2918       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2919 
2920       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2921       we call MatZeroEntries() on aij->B first, so there may be a bunch of explicit zeros that, perhaps, could be compacted out.
2922 
2923       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2924       At least avoid calling MatSetValues() and the implied searches?
2925     */
2926 
2927     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2928 #if defined(PETSC_USE_CTABLE)
2929       PetscCall(PetscHMapIDestroy(&aij->colmap));
2930 #else
2931       PetscCall(PetscFree(aij->colmap));
2932       /* A bit of a HACK: ideally we should deal with the aij->B case entirely in the code block below. */
2933 #endif
2934       ngcol = 0;
2935       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2936       if (aij->garray) PetscCall(PetscFree(aij->garray));
2937       PetscCall(VecDestroy(&aij->lvec));
2938       PetscCall(VecScatterDestroy(&aij->Mvctx));
2939     }
2940     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2941     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2942   }
2943   Bdisassembled = PETSC_FALSE;
2944   if (!aij->B) {
2945     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2946     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2947     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2948     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2949     Bdisassembled = PETSC_TRUE;
2950   }
2951   if (B) {
2952     Baij = (Mat_SeqAIJ *)B->data;
2953     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2954       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2955       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2956       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2957       PetscCall(PetscFree(nz));
2958     }
2959 
2960     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2961     shift      = rend - rstart;
2962     count      = 0;
2963     rowindices = NULL;
2964     colindices = NULL;
2965     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2966     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2967     for (i = 0; i < B->rmap->n; i++) {
2968       PetscInt row;
2969       row = i;
2970       if (rowindices) row = rowindices[i];
2971       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2972         col = Baij->j[count];
2973         if (colindices) col = colindices[col];
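        /* A freshly created aij->B above is in the "disassembled" state and uses C's global column
           numbering, so B's column indices (which skip the local diagonal block) are shifted past
           that block to become global indices of C */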
2974         if (Bdisassembled && col >= rstart) col += shift;
2975         v = Baij->a[count];
2976         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2977         ++count;
2978       }
2979     }
2980     /* No assembly for aij->B is necessary. */
2981     /* FIXME: set aij->B's nonzerostate correctly. */
2982   } else {
2983     PetscCall(MatSetUp(aij->B));
2984   }
2985   C->preallocated  = PETSC_TRUE;
2986   C->was_assembled = PETSC_FALSE;
2987   C->assembled     = PETSC_FALSE;
2988   /*
2989       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2990       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2991    */
2992   PetscFunctionReturn(PETSC_SUCCESS);
2993 }
2994 
2995 /*
2996   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
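  For example (hypothetical numbers): if garray = {2, 7, 11}, local column 0 of B is global column 2
  of C, local column 1 is global column 7, and local column 2 is global column 11.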
2997  */
2998 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2999 {
3000   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3001 
3002   PetscFunctionBegin;
3003   PetscAssertPointer(A, 2);
3004   PetscAssertPointer(B, 3);
3005   /* FIXME: make sure C is assembled */
3006   *A = aij->A;
3007   *B = aij->B;
3008   /* Note that we don't incref *A and *B, so be careful! */
3009   PetscFunctionReturn(PETSC_SUCCESS);
3010 }
3011 
3012 /*
3013   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3014   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3015 */
3016 static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3017 {
3018   PetscMPIInt size, flag;
3019   PetscInt    i, ii, cismax, ispar;
3020   Mat        *A, *B;
3021   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3022 
3023   PetscFunctionBegin;
3024   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3025 
3026   for (i = 0, cismax = 0; i < ismax; ++i) {
3027     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3028     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3029     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3030     if (size > 1) ++cismax;
3031   }
3032 
3033   /*
3034      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3035      ispar counts the number of parallel ISs across C's comm.
3036   */
3037   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3038   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3039     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3040     PetscFunctionReturn(PETSC_SUCCESS);
3041   }
3042 
3043   /* if (ispar) */
3044   /*
3045     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3046     These are used to extract the off-diag portion of the resulting parallel matrix.
3047     The row IS for the off-diag portion is the same as for the diag portion,
3048     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3049   */
3050   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3051   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3052   for (i = 0, ii = 0; i < ismax; ++i) {
3053     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3054     if (size > 1) {
3055       /*
3056          TODO: This is the part that's ***NOT SCALABLE***.
3057          To fix this we need to extract just the indices of C's nonzero columns
3058          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3059          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3060          to be done without serializing on the IS list, so, most likely, it is best
3061          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3062       */
3063       PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3064       /* Now we have to
3065          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3066              were sorted on each rank, concatenated they might no longer be sorted;
3067          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3068              indices in the nondecreasing order to the original index positions.
3069          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3070       */
3071       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3072       PetscCall(ISSort(ciscol[ii]));
3073       ++ii;
3074     }
3075   }
3076   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3077   for (i = 0, ii = 0; i < ismax; ++i) {
3078     PetscInt        j, issize;
3079     const PetscInt *indices;
3080 
3081     /*
3082        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3083      */
3084     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3085     PetscCall(ISSort(isrow[i]));
3086     PetscCall(ISGetLocalSize(isrow[i], &issize));
3087     PetscCall(ISGetIndices(isrow[i], &indices));
3088     for (j = 1; j < issize; ++j) {
3089       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3090     }
3091     PetscCall(ISRestoreIndices(isrow[i], &indices));
3092     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3093     PetscCall(ISSort(iscol[i]));
3094     PetscCall(ISGetLocalSize(iscol[i], &issize));
3095     PetscCall(ISGetIndices(iscol[i], &indices));
3096     for (j = 1; j < issize; ++j) {
3097       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3098     }
3099     PetscCall(ISRestoreIndices(iscol[i], &indices));
3100     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3101     if (size > 1) {
3102       cisrow[ii] = isrow[i];
3103       ++ii;
3104     }
3105   }
3106   /*
3107     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3108     array of sequential matrices underlying the resulting parallel matrices.
3109     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3110     contain duplicates.
3111 
3112     There are as many diag matrices as there are original index sets. There are only as many parallel
3113     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3114 
3115     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3116     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3117       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3118       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3119       with A[i] and B[ii] extracted from the corresponding MPI submat.
3120     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3121       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3122       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3123       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3124       values into A[i] and B[ii] sitting inside the corresponding submat.
3125     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3126       A[i], B[ii], AA[i] or BB[ii] matrices.
3127   */
3128   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3129   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3130 
3131   /* Now obtain the sequential A and B submatrices separately. */
3132   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3133   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3134   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3135 
3136   /*
3137     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3138     matrices A & B have been extracted directly into the parallel matrices containing them, or
3139     simply into the sequential matrix identical with the corresponding A (if size == 1).
3140     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3141     to have the same sparsity pattern.
3142     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3143     must be constructed for C. This is done by setseqmat(s).
3144   */
3145   for (i = 0, ii = 0; i < ismax; ++i) {
3146     /*
3147        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3148        That way we can avoid sorting and computing permutations when reusing.
3149        To this end:
3150         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3151         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3152           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() so the cached contents are freed with the container).
3153     */
3154     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3155 
3156     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3157     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3158     if (size > 1) {
3159       if (scall == MAT_INITIAL_MATRIX) {
3160         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3161         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3162         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3163         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3164         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3165       }
3166       /*
3167         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3168       */
3169       {
3170         Mat AA = A[i], BB = B[ii];
3171 
3172         if (AA || BB) {
3173           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3174           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3175           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3176         }
3177         PetscCall(MatDestroy(&AA));
3178       }
3179       PetscCall(ISDestroy(ciscol + ii));
3180       PetscCall(ISDestroy(ciscol_p + ii));
3181       ++ii;
3182     } else { /* if (size == 1) */
3183       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3184       if (isrow_p[i] || iscol_p[i]) {
3185         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3186         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3187         /* Otherwise A is extracted straight into (*submat)[i]. */
3188         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3189         PetscCall(MatDestroy(A + i));
3190       } else (*submat)[i] = A[i];
3191     }
3192     PetscCall(ISDestroy(&isrow_p[i]));
3193     PetscCall(ISDestroy(&iscol_p[i]));
3194   }
3195   PetscCall(PetscFree2(cisrow, ciscol));
3196   PetscCall(PetscFree2(isrow_p, iscol_p));
3197   PetscCall(PetscFree(ciscol_p));
3198   PetscCall(PetscFree(A));
3199   PetscCall(MatDestroySubMatrices(cismax, &B));
3200   PetscFunctionReturn(PETSC_SUCCESS);
3201 }
3202 
3203 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3204 {
3205   PetscFunctionBegin;
3206   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3207   PetscFunctionReturn(PETSC_SUCCESS);
3208 }
3209
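
/*
  A minimal calling sketch for the user-facing entry point that dispatches to the routine above
  (the index sets and their communicators are illustrative assumptions, not a prescribed usage):

    IS   isrow[1], iscol[1]; // may live on a subcommunicator of C; a parallel IS triggers the MPI path
    Mat *submats;
    PetscCall(MatCreateSubMatricesMPI(C, 1, isrow, iscol, MAT_INITIAL_MATRIX, &submats));
    // ... use submats[0], created on isrow[0]'s communicator (MPIAIJ if parallel, SeqAIJ if serial) ...
    // destroy the submatrices and the array when finished
*/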