xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 35cb6cd333087cc89d8d5031932d4f38af02614d)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
25    that is a very major recoding job.
26 
27    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
28 */
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(0);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(0);
86 }
87 
/*
   One round of memory-scalable overlap increase.

   Each rank scans its nidx index sets, finds entries referring to rows owned by
   other ranks, ships those <is id, row> pairs to the owning ranks via a PetscSF,
   and receives back, per requested row, the column indices making up its overlap.
   Locally-owned rows are expanded by MatIncreaseOverlap_MPIAIJ_Local_Scalable(),
   overlapped with the second communication phase.

   Collective on the communicator of mat.  Replaces each is[i] in place.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    /* no rank references remote rows: the purely local expansion suffices */
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  /* tosizes holds <size, offset> pairs per destination; tosizes_temp is reused
     below as a map from rank to position in the compressed to-list */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: reply sizes flow back to the requesting ranks */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  /* merge the remote overlap data received in todata into the (already locally
     expanded) index sets */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
230 
/*
   Merges remote overlap data into the local index sets.

   recvdata is a packed stream of triples <is id, count, count column indices>
   of total length nrecvs.  Each is[i] is replaced by a new sorted,
   duplicate-free IS holding its previous entries plus the received columns.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of max_lsize+nrecvs entries per IS: enough for the existing
     entries plus, in the worst case, every received index */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* keep each IS's communicator alive across the destroy/recreate below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}
278 
/*
   Builds the reply messages for remote row requests (owner side).

   fromrows contains <is id, row> pairs received from nfrom ranks (fromsizes[2*i]
   entries from rank fromranks[i]).  For every requested locally-owned row the
   routine gathers the row's column indices from the diagonal (A) and
   off-diagonal (B) parts, deduplicates them per IS, and packs reply triples
   <is id, count, columns> into *sbrows with per-rank <size, offset> pairs in
   *sbrowsizes.  Both output arrays are allocated here; the caller frees them.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass over the requests: only count the per-(rank, IS) reply sizes so
     that the reply buffer can be allocated in one shot */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* global row -> local row */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: identical traversal, this time filling the reply buffer */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs as expected by the caller */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
400 
/*
   Local part of the scalable overlap increase.

   For every locally-owned row referenced by is[i], collects the row's column
   indices from the diagonal (A) and off-diagonal (B) parts of the MPIAIJ
   matrix, then replaces is[i] with the sorted, duplicate-free union of those
   columns.  Entries of is[i] owned by other ranks are skipped here; they are
   handled through the communication path in the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remote rows are handled via communication */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart; /* local column -> global column */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]]; /* off-diagonal columns are already global */
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* keep the communicator alive while the IS is destroyed and recreated */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
467 
468 /*
469   Sample message format:
470   If a processor A wants processor B to process some elements corresponding
471   to index sets is[1],is[5]
472   mesg [0] = 2   (no of index sets in the mesg)
473   -----------
474   mesg [1] = 1 => is[1]
475   mesg [2] = sizeof(is[1]);
476   -----------
477   mesg [3] = 5  => is[5]
478   mesg [4] = sizeof(is[5]);
479   -----------
480   mesg [5]
481   mesg [n]  datas[1]
482   -----------
483   mesg[n+1]
484   mesg[m]  data(is[5])
485   -----------
486 
487   Notes:
488   nrqs - no of requests sent (or to be sent out)
489   nrqr - no of requests received (which have to be or which have been processed)
490 */
491 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492 {
493   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495   const PetscInt **idx, *idx_i;
496   PetscInt        *n, **data, len;
497 #if defined(PETSC_USE_CTABLE)
498   PetscTable *table_data, table_data_i;
499   PetscInt   *tdata, tcount, tcount_max;
500 #else
501   PetscInt *data_i, *d_p;
502 #endif
503   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506   PetscBT     *table;
507   MPI_Comm     comm;
508   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509   MPI_Status  *recv_status;
510   MPI_Comm    *iscomms;
511   char        *t_p;
512 
513   PetscFunctionBegin;
514   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515   size = c->size;
516   rank = c->rank;
517   M    = C->rmap->N;
518 
519   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
521 
522   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
523 
524   for (i = 0; i < imax; i++) {
525     PetscCall(ISGetIndices(is[i], &idx[i]));
526     PetscCall(ISGetLocalSize(is[i], &n[i]));
527   }
528 
529   /* evaluate communication - mesg to who,length of mesg, and buffer space
530      required. Based on this, buffers are allocated, and data copied into them  */
531   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532   for (i = 0; i < imax; i++) {
533     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534     idx_i = idx[i];
535     len   = n[i];
536     for (j = 0; j < len; j++) {
537       row = idx_i[j];
538       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540       w4[proc]++;
541     }
542     for (j = 0; j < size; j++) {
543       if (w4[j]) {
544         w1[j] += w4[j];
545         w3[j]++;
546       }
547     }
548   }
549 
550   nrqs     = 0; /* no of outgoing messages */
551   msz      = 0; /* total mesg length (for all proc */
552   w1[rank] = 0; /* no mesg sent to intself */
553   w3[rank] = 0;
554   for (i = 0; i < size; i++) {
555     if (w1[i]) {
556       w2[i] = 1;
557       nrqs++;
558     } /* there exists a message to proc i */
559   }
560   /* pa - is list of processors to communicate with */
561   PetscCall(PetscMalloc1(nrqs, &pa));
562   for (i = 0, j = 0; i < size; i++) {
563     if (w1[i]) {
564       pa[j] = i;
565       j++;
566     }
567   }
568 
569   /* Each message would have a header = 1 + 2*(no of IS) + data */
570   for (i = 0; i < nrqs; i++) {
571     j = pa[i];
572     w1[j] += w2[j] + 2 * w3[j];
573     msz += w1[j];
574   }
575 
576   /* Determine the number of messages to expect, their lengths, from from-ids */
577   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
579 
580   /* Now post the Irecvs corresponding to these messages */
581   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
582 
583   /* Allocate Memory for outgoing messages */
584   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585   PetscCall(PetscArrayzero(outdat, size));
586   PetscCall(PetscArrayzero(ptr, size));
587 
588   {
589     PetscInt *iptr = tmp, ict = 0;
590     for (i = 0; i < nrqs; i++) {
591       j = pa[i];
592       iptr += ict;
593       outdat[j] = iptr;
594       ict       = w1[j];
595     }
596   }
597 
598   /* Form the outgoing messages */
599   /* plug in the headers */
600   for (i = 0; i < nrqs; i++) {
601     j            = pa[i];
602     outdat[j][0] = 0;
603     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605   }
606 
607   /* Memory for doing local proc's work */
608   {
609     PetscInt M_BPB_imax = 0;
610 #if defined(PETSC_USE_CTABLE)
611     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612     PetscCall(PetscMalloc1(imax, &table_data));
613     for (i = 0; i < imax; i++) PetscCall(PetscTableCreate(n[i], M, &table_data[i]));
614     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616 #else
617     PetscInt Mimax = 0;
618     PetscCall(PetscIntMultError(M, imax, &Mimax));
619     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621     for (i = 0; i < imax; i++) {
622       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623       data[i]  = d_p + M * i;
624     }
625 #endif
626   }
627 
628   /* Parse the IS and update local tables and the outgoing buf with the data */
629   {
630     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631     PetscBT  table_i;
632 
633     for (i = 0; i < imax; i++) {
634       PetscCall(PetscArrayzero(ctr, size));
635       n_i     = n[i];
636       table_i = table[i];
637       idx_i   = idx[i];
638 #if defined(PETSC_USE_CTABLE)
639       table_data_i = table_data[i];
640 #else
641       data_i = data[i];
642 #endif
643       isz_i = isz[i];
644       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645         row = idx_i[j];
646         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647         if (proc != rank) { /* copy to the outgoing buffer */
648           ctr[proc]++;
649           *ptr[proc] = row;
650           ptr[proc]++;
651         } else if (!PetscBTLookupSet(table_i, row)) {
652 #if defined(PETSC_USE_CTABLE)
653           PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
654 #else
655           data_i[isz_i] = row; /* Update the local table */
656 #endif
657           isz_i++;
658         }
659       }
660       /* Update the headers for the current IS */
661       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662         if ((ctr_j = ctr[j])) {
663           outdat_j            = outdat[j];
664           k                   = ++outdat_j[0];
665           outdat_j[2 * k]     = ctr_j;
666           outdat_j[2 * k - 1] = i;
667         }
668       }
669       isz[i] = isz_i;
670     }
671   }
672 
673   /*  Now  post the sends */
674   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675   for (i = 0; i < nrqs; ++i) {
676     j = pa[i];
677     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678   }
679 
680   /* No longer need the original indices */
681   PetscCall(PetscMalloc1(imax, &iscomms));
682   for (i = 0; i < imax; ++i) {
683     PetscCall(ISRestoreIndices(is[i], idx + i));
684     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685   }
686   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
687 
688   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
689 
690     /* Do Local work */
691 #if defined(PETSC_USE_CTABLE)
692   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693 #else
694   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695 #endif
696 
697   /* Receive messages */
698   PetscCall(PetscMalloc1(nrqr, &recv_status));
699   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
701 
702   /* Phase 1 sends are complete - deallocate buffers */
703   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704   PetscCall(PetscFree4(w1, w2, w3, w4));
705 
706   PetscCall(PetscMalloc1(nrqr, &xdata));
707   PetscCall(PetscMalloc1(nrqr, &isz1));
708   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709   PetscCall(PetscFree(rbuf[0]));
710   PetscCall(PetscFree(rbuf));
711 
712   /* Send the data back */
713   /* Do a global reduction to know the buffer space req for incoming messages */
714   {
715     PetscMPIInt *rw1;
716 
717     PetscCall(PetscCalloc1(size, &rw1));
718 
719     for (i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721 
722       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723       rw1[proc] = isz1[i];
724     }
725     PetscCall(PetscFree(onodes1));
726     PetscCall(PetscFree(olengths1));
727 
728     /* Determine the number of messages to expect, their lengths, from from-ids */
729     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730     PetscCall(PetscFree(rw1));
731   }
732   /* Now post the Irecvs corresponding to these messages */
733   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
734 
735   /* Now  post the sends */
736   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737   for (i = 0; i < nrqr; ++i) {
738     j = recv_status[i].MPI_SOURCE;
739     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740   }
741 
742   /* receive work done on other processors */
743   {
744     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745     PetscMPIInt idex;
746     PetscBT     table_i;
747 
748     for (i = 0; i < nrqs; ++i) {
749       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750       /* Process the message */
751       rbuf2_i = rbuf2[idex];
752       ct1     = 2 * rbuf2_i[0] + 1;
753       jmax    = rbuf2[idex][0];
754       for (j = 1; j <= jmax; j++) {
755         max     = rbuf2_i[2 * j];
756         is_no   = rbuf2_i[2 * j - 1];
757         isz_i   = isz[is_no];
758         table_i = table[is_no];
759 #if defined(PETSC_USE_CTABLE)
760         table_data_i = table_data[is_no];
761 #else
762         data_i = data[is_no];
763 #endif
764         for (k = 0; k < max; k++, ct1++) {
765           row = rbuf2_i[ct1];
766           if (!PetscBTLookupSet(table_i, row)) {
767 #if defined(PETSC_USE_CTABLE)
768             PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
769 #else
770             data_i[isz_i] = row;
771 #endif
772             isz_i++;
773           }
774         }
775         isz[is_no] = isz_i;
776       }
777     }
778 
779     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780   }
781 
782 #if defined(PETSC_USE_CTABLE)
783   tcount_max = 0;
784   for (i = 0; i < imax; ++i) {
785     table_data_i = table_data[i];
786     PetscCall(PetscTableGetCount(table_data_i, &tcount));
787     if (tcount_max < tcount) tcount_max = tcount;
788   }
789   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790 #endif
791 
792   for (i = 0; i < imax; ++i) {
793 #if defined(PETSC_USE_CTABLE)
794     PetscTablePosition tpos;
795     table_data_i = table_data[i];
796 
797     PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
798     while (tpos) {
799       PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
800       tdata[--j] = --k;
801     }
802     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
803 #else
804     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
805 #endif
806     PetscCall(PetscCommDestroy(&iscomms[i]));
807   }
808 
809   PetscCall(PetscFree(iscomms));
810   PetscCall(PetscFree(onodes2));
811   PetscCall(PetscFree(olengths2));
812 
813   PetscCall(PetscFree(pa));
814   PetscCall(PetscFree(rbuf2[0]));
815   PetscCall(PetscFree(rbuf2));
816   PetscCall(PetscFree(s_waits1));
817   PetscCall(PetscFree(r_waits1));
818   PetscCall(PetscFree(s_waits2));
819   PetscCall(PetscFree(r_waits2));
820   PetscCall(PetscFree(recv_status));
821   if (xdata) {
822     PetscCall(PetscFree(xdata[0]));
823     PetscCall(PetscFree(xdata));
824   }
825   PetscCall(PetscFree(isz1));
826 #if defined(PETSC_USE_CTABLE)
827   for (i = 0; i < imax; i++) PetscCall(PetscTableDestroy((PetscTable *)&table_data[i]));
828   PetscCall(PetscFree(table_data));
829   PetscCall(PetscFree(tdata));
830   PetscCall(PetscFree4(table, data, isz, t_p));
831 #else
832   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
833 #endif
834   PetscFunctionReturn(0);
835 }
836 
837 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
839        the work on the local processor.
840 
841      Inputs:
842       C      - MAT_MPIAIJ;
843       imax - total no of index sets processed at a time;
844       table  - an array of char - size = m bits.
845 
846      Output:
847       isz    - array containing the count of the solution elements corresponding
848                to each index set;
849       data or table_data  - pointer to the solutions
850 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* A: diagonal block, B: off-diagonal block of the locally owned rows */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i; /* bitmap of global indices already present in set i (duplicate guard) */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps compressed B column indices to global column indices */

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      /* the table stores key+1 -> position+1 (1-based so 0 is never a valid entry); undo both shifts */
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    /* only the rows present on entry are expanded; indices appended below are NOT
       revisited in this pass, so each call adds exactly one level of overlap */
    max     = isz[i];

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: columns are local to the diagonal block, shift to global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
          /* val not yet in set i: append it */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: map compressed column through garray to global */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* updated count of solution elements for set i */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
938 
939 /*
940       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
941          and return the output
942 
943          Input:
944            C    - the matrix
945            nrqr - no of messages being processed.
946            rbuf - an array of pointers to the received requests
947 
948          Output:
949            xdata - array of messages to be sent back
950            isz1  - size of each message
951 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows, as it is now where a single large chunk of
memory is used.
956 
957 */
958 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
959 {
960   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
961   Mat         A = c->A, B = c->B;
962   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
963   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
964   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
965   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
966   PetscInt   *rbuf_i, kmax, rbuf_0;
967   PetscBT     xtable;
968 
969   PetscFunctionBegin;
970   m      = C->rmap->N;
971   rstart = C->rmap->rstart;
972   cstart = C->cmap->rstart;
973   ai     = a->i;
974   aj     = a->j;
975   bi     = b->i;
976   bj     = b->j;
977   garray = c->garray;
978 
979   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
980     rbuf_i = rbuf[i];
981     rbuf_0 = rbuf_i[0];
982     ct += rbuf_0;
983     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
984   }
985 
986   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
987   else max1 = 1;
988   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
989   if (nrqr) {
990     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
991     ++no_malloc;
992   }
993   PetscCall(PetscBTCreate(m, &xtable));
994   PetscCall(PetscArrayzero(isz1, nrqr));
995 
996   ct3 = 0;
997   for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */
998     rbuf_i = rbuf[i];
999     rbuf_0 = rbuf_i[0];
1000     ct1    = 2 * rbuf_0 + 1;
1001     ct2    = ct1;
1002     ct3 += ct1;
1003     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1004       PetscCall(PetscBTMemzero(m, xtable));
1005       oct2 = ct2;
1006       kmax = rbuf_i[2 * j];
1007       for (k = 0; k < kmax; k++, ct1++) {
1008         row = rbuf_i[ct1];
1009         if (!PetscBTLookupSet(xtable, row)) {
1010           if (!(ct3 < mem_estimate)) {
1011             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1012             PetscCall(PetscMalloc1(new_estimate, &tmp));
1013             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1014             PetscCall(PetscFree(xdata[0]));
1015             xdata[0]     = tmp;
1016             mem_estimate = new_estimate;
1017             ++no_malloc;
1018             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1019           }
1020           xdata[i][ct2++] = row;
1021           ct3++;
1022         }
1023       }
1024       for (k = oct2, max2 = ct2; k < max2; k++) {
1025         row   = xdata[i][k] - rstart;
1026         start = ai[row];
1027         end   = ai[row + 1];
1028         for (l = start; l < end; l++) {
1029           val = aj[l] + cstart;
1030           if (!PetscBTLookupSet(xtable, val)) {
1031             if (!(ct3 < mem_estimate)) {
1032               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1033               PetscCall(PetscMalloc1(new_estimate, &tmp));
1034               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1035               PetscCall(PetscFree(xdata[0]));
1036               xdata[0]     = tmp;
1037               mem_estimate = new_estimate;
1038               ++no_malloc;
1039               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1040             }
1041             xdata[i][ct2++] = val;
1042             ct3++;
1043           }
1044         }
1045         start = bi[row];
1046         end   = bi[row + 1];
1047         for (l = start; l < end; l++) {
1048           val = garray[bj[l]];
1049           if (!PetscBTLookupSet(xtable, val)) {
1050             if (!(ct3 < mem_estimate)) {
1051               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1052               PetscCall(PetscMalloc1(new_estimate, &tmp));
1053               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1054               PetscCall(PetscFree(xdata[0]));
1055               xdata[0]     = tmp;
1056               mem_estimate = new_estimate;
1057               ++no_malloc;
1058               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1059             }
1060             xdata[i][ct2++] = val;
1061             ct3++;
1062           }
1063         }
1064       }
1065       /* Update the header*/
1066       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1067       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1068     }
1069     xdata[i][0] = rbuf_0;
1070     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1071     isz1[i] = ct2; /* size of each message */
1072   }
1073   PetscCall(PetscBTDestroy(&xtable));
1074   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1075   PetscFunctionReturn(0);
1076 }
1077 /* -------------------------------------------------------------------------*/
1078 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1079 /*
1080     Every processor gets the entire matrix
1081 */
/* Gather the whole parallel matrix A onto every process as a sequential matrix.
   flag == MAT_GET_VALUES also gathers the numerical values; otherwise only the
   nonzero structure is built.  Bin is allocated here (length 2) when
   scall == MAT_INITIAL_MATRIX and reused when scall == MAT_REUSE_MATRIX.
   NOTE(review): the second slot of *Bin is left NULL here — presumably reserved
   for the caller/destroy path; confirm against the callers. */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* each rank fills its own slice of lens[]; row length = diagonal (ad) + off-diagonal (bd) nonzeros */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgatherv works because every rank wrote only its own slice of lens[] */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* pack directly into this rank's slice of B's global j array, so the later
       in-place allgatherv fills the remaining slices */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* emit each row's columns in ascending global order: B columns below the
       diagonal block, then the diagonal (A) columns, then B columns above it.
       NOTE(review): the split point uses A->rmap->rstart (row ownership) for a
       column comparison — assumes identical row/column layouts (square matrix);
       confirm for rectangular use. */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by rank i (sum of its row lengths) */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    /* write values into this rank's slice of b->a, same lower/diag/upper order as the indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* on the reuse path recvcounts/displs were not built above; build them now */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(0);
}
1227 
1228 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1229 {
1230   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1231   Mat             submat, A = c->A, B = c->B;
1232   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1233   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1234   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1235   const PetscInt *icol, *irow;
1236   PetscInt        nrow, ncol, start;
1237   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1238   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1239   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1240   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1241   PetscInt       *lens, rmax, ncols, *cols, Crow;
1242 #if defined(PETSC_USE_CTABLE)
1243   PetscTable cmap, rmap;
1244   PetscInt  *cmap_loc, *rmap_loc;
1245 #else
1246   PetscInt *cmap, *rmap;
1247 #endif
1248   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1249   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1250   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1251   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1252   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1253   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1254   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1255   MPI_Comm      comm;
1256   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1257   PetscMPIInt  *onodes1, *olengths1, idex, end;
1258   Mat_SubSppt  *smatis1;
1259   PetscBool     isrowsorted, iscolsorted;
1260 
1261   PetscFunctionBegin;
1262   PetscValidLogicalCollectiveInt(C, ismax, 2);
1263   PetscValidLogicalCollectiveEnum(C, scall, 5);
1264   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1265   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1266   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1267   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1268   size = c->size;
1269   rank = c->rank;
1270 
1271   PetscCall(ISSorted(iscol[0], &iscolsorted));
1272   PetscCall(ISSorted(isrow[0], &isrowsorted));
1273   PetscCall(ISGetIndices(isrow[0], &irow));
1274   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1275   if (allcolumns) {
1276     icol = NULL;
1277     ncol = C->cmap->N;
1278   } else {
1279     PetscCall(ISGetIndices(iscol[0], &icol));
1280     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1281   }
1282 
1283   if (scall == MAT_INITIAL_MATRIX) {
1284     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1285 
1286     /* Get some new tags to keep the communication clean */
1287     tag1 = ((PetscObject)C)->tag;
1288     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1289     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1290 
1291     /* evaluate communication - mesg to who, length of mesg, and buffer space
1292      required. Based on this, buffers are allocated, and data copied into them */
1293     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1294     PetscCall(PetscMalloc1(nrow, &row2proc));
1295 
1296     /* w1[proc] = num of rows owned by proc -- to be requested */
1297     proc = 0;
1298     nrqs = 0; /* num of outgoing messages */
1299     for (j = 0; j < nrow; j++) {
1300       row = irow[j];
1301       if (!isrowsorted) proc = 0;
1302       while (row >= C->rmap->range[proc + 1]) proc++;
1303       w1[proc]++;
1304       row2proc[j] = proc; /* map row index to proc */
1305 
1306       if (proc != rank && !w2[proc]) {
1307         w2[proc] = 1;
1308         nrqs++;
1309       }
1310     }
1311     w1[rank] = 0; /* rows owned by self will not be requested */
1312 
1313     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1314     for (proc = 0, j = 0; proc < size; proc++) {
1315       if (w1[proc]) pa[j++] = proc;
1316     }
1317 
1318     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1319     msz = 0; /* total mesg length (for all procs) */
1320     for (i = 0; i < nrqs; i++) {
1321       proc = pa[i];
1322       w1[proc] += 3;
1323       msz += w1[proc];
1324     }
1325     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1326 
1327     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1328     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1329     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1330 
1331     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1332        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1333     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1334 
1335     /* Now post the Irecvs corresponding to these messages */
1336     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1337 
1338     PetscCall(PetscFree(onodes1));
1339     PetscCall(PetscFree(olengths1));
1340 
1341     /* Allocate Memory for outgoing messages */
1342     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1343     PetscCall(PetscArrayzero(sbuf1, size));
1344     PetscCall(PetscArrayzero(ptr, size));
1345 
1346     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1347     iptr = tmp;
1348     for (i = 0; i < nrqs; i++) {
1349       proc        = pa[i];
1350       sbuf1[proc] = iptr;
1351       iptr += w1[proc];
1352     }
1353 
1354     /* Form the outgoing messages */
1355     /* Initialize the header space */
1356     for (i = 0; i < nrqs; i++) {
1357       proc = pa[i];
1358       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1359       ptr[proc] = sbuf1[proc] + 3;
1360     }
1361 
1362     /* Parse the isrow and copy data into outbuf */
1363     PetscCall(PetscArrayzero(ctr, size));
1364     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1365       proc = row2proc[j];
1366       if (proc != rank) { /* copy to the outgoing buf*/
1367         *ptr[proc] = irow[j];
1368         ctr[proc]++;
1369         ptr[proc]++;
1370       }
1371     }
1372 
1373     /* Update the headers for the current IS */
1374     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1375       if ((ctr_j = ctr[j])) {
1376         sbuf1_j            = sbuf1[j];
1377         k                  = ++sbuf1_j[0];
1378         sbuf1_j[2 * k]     = ctr_j;
1379         sbuf1_j[2 * k - 1] = 0;
1380       }
1381     }
1382 
1383     /* Now post the sends */
1384     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1385     for (i = 0; i < nrqs; ++i) {
1386       proc = pa[i];
1387       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1388     }
1389 
1390     /* Post Receives to capture the buffer size */
1391     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1392     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1393 
1394     if (nrqs) rbuf2[0] = tmp + msz;
1395     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1396 
1397     for (i = 0; i < nrqs; ++i) {
1398       proc = pa[i];
1399       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1400     }
1401 
1402     PetscCall(PetscFree2(w1, w2));
1403 
1404     /* Send to other procs the buf size they should allocate */
1405     /* Receive messages*/
1406     PetscCall(PetscMalloc1(nrqr, &r_status1));
1407     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1408 
1409     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1410     for (i = 0; i < nrqr; ++i) {
1411       req_size[i] = 0;
1412       rbuf1_i     = rbuf1[i];
1413       start       = 2 * rbuf1_i[0] + 1;
1414       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1415       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1416       sbuf2_i = sbuf2[i];
1417       for (j = start; j < end; j++) {
1418         k          = rbuf1_i[j] - rstart;
1419         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1420         sbuf2_i[j] = ncols;
1421         req_size[i] += ncols;
1422       }
1423       req_source1[i] = r_status1[i].MPI_SOURCE;
1424 
1425       /* form the header */
1426       sbuf2_i[0] = req_size[i];
1427       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1428 
1429       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1430     }
1431 
1432     PetscCall(PetscFree(r_status1));
1433     PetscCall(PetscFree(r_waits1));
1434 
1435     /* rbuf2 is received, Post recv column indices a->j */
1436     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1437 
1438     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1439     for (i = 0; i < nrqs; ++i) {
1440       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1441       req_source2[i] = r_status2[i].MPI_SOURCE;
1442       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1443     }
1444 
1445     /* Wait on sends1 and sends2 */
1446     PetscCall(PetscMalloc1(nrqs, &s_status1));
1447     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1448     PetscCall(PetscFree(s_waits1));
1449     PetscCall(PetscFree(s_status1));
1450 
1451     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1452     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1453 
1454     /* Now allocate sending buffers for a->j, and send them off */
1455     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1456     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1457     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1458     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1459 
1460     for (i = 0; i < nrqr; i++) { /* for each requested message */
1461       rbuf1_i   = rbuf1[i];
1462       sbuf_aj_i = sbuf_aj[i];
1463       ct1       = 2 * rbuf1_i[0] + 1;
1464       ct2       = 0;
1465       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1466 
1467       kmax = rbuf1[i][2];
1468       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1469         row    = rbuf1_i[ct1] - rstart;
1470         nzA    = ai[row + 1] - ai[row];
1471         nzB    = bi[row + 1] - bi[row];
1472         ncols  = nzA + nzB;
1473         cworkA = aj + ai[row];
1474         cworkB = bj + bi[row];
1475 
1476         /* load the column indices for this row into cols*/
1477         cols = sbuf_aj_i + ct2;
1478 
1479         lwrite = 0;
1480         for (l = 0; l < nzB; l++) {
1481           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482         }
1483         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484         for (l = 0; l < nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486         }
1487 
1488         ct2 += ncols;
1489       }
1490       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1491     }
1492 
1493     /* create column map (cmap): global col of C -> local col of submat */
1494 #if defined(PETSC_USE_CTABLE)
1495     if (!allcolumns) {
1496       PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap));
1497       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1498       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1499         if (icol[j] >= cstart && icol[j] < cend) {
1500           cmap_loc[icol[j] - cstart] = j + 1;
1501         } else { /* use PetscTable for non-local col indices */
1502           PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES));
1503         }
1504       }
1505     } else {
1506       cmap     = NULL;
1507       cmap_loc = NULL;
1508     }
1509     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1510 #else
1511     if (!allcolumns) {
1512       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1513       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1514     } else {
1515       cmap = NULL;
1516     }
1517 #endif
1518 
1519     /* Create lens for MatSeqAIJSetPreallocation() */
1520     PetscCall(PetscCalloc1(nrow, &lens));
1521 
1522     /* Compute lens from local part of C */
1523     for (j = 0; j < nrow; j++) {
1524       row  = irow[j];
1525       proc = row2proc[j];
1526       if (proc == rank) {
1527         /* diagonal part A = c->A */
1528         ncols = ai[row - rstart + 1] - ai[row - rstart];
1529         cols  = aj + ai[row - rstart];
1530         if (!allcolumns) {
1531           for (k = 0; k < ncols; k++) {
1532 #if defined(PETSC_USE_CTABLE)
1533             tcol = cmap_loc[cols[k]];
1534 #else
1535             tcol = cmap[cols[k] + cstart];
1536 #endif
1537             if (tcol) lens[j]++;
1538           }
1539         } else { /* allcolumns */
1540           lens[j] = ncols;
1541         }
1542 
1543         /* off-diagonal part B = c->B */
1544         ncols = bi[row - rstart + 1] - bi[row - rstart];
1545         cols  = bj + bi[row - rstart];
1546         if (!allcolumns) {
1547           for (k = 0; k < ncols; k++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1550 #else
1551             tcol = cmap[bmap[cols[k]]];
1552 #endif
1553             if (tcol) lens[j]++;
1554           }
1555         } else { /* allcolumns */
1556           lens[j] += ncols;
1557         }
1558       }
1559     }
1560 
1561     /* Create row map (rmap): global row of C -> local row of submat */
1562 #if defined(PETSC_USE_CTABLE)
1563     PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap));
1564     for (j = 0; j < nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) { /* a local row */
1568         rmap_loc[row - rstart] = j;
1569       } else {
1570         PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES));
1571       }
1572     }
1573 #else
1574     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1575     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1576 #endif
1577 
1578     /* Update lens from offproc data */
1579     /* recv a->j is done */
1580     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1581     for (i = 0; i < nrqs; i++) {
1582       proc    = pa[i];
1583       sbuf1_i = sbuf1[proc];
1584       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1585       ct1     = 2 + 1;
1586       ct2     = 0;
1587       rbuf2_i = rbuf2[i]; /* received length of C->j */
1588       rbuf3_i = rbuf3[i]; /* received C->j */
1589 
1590       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1591       max1 = sbuf1_i[2];
1592       for (k = 0; k < max1; k++, ct1++) {
1593 #if defined(PETSC_USE_CTABLE)
1594         PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row));
1595         row--;
1596         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1597 #else
1598         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1599 #endif
1600         /* Now, store row index of submat in sbuf1_i[ct1] */
1601         sbuf1_i[ct1] = row;
1602 
1603         nnz = rbuf2_i[ct1];
1604         if (!allcolumns) {
1605           for (l = 0; l < nnz; l++, ct2++) {
1606 #if defined(PETSC_USE_CTABLE)
1607             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1608               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1609             } else {
1610               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1611             }
1612 #else
1613             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1614 #endif
1615             if (tcol) lens[row]++;
1616           }
1617         } else { /* allcolumns */
1618           lens[row] += nnz;
1619         }
1620       }
1621     }
1622     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1623     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1624 
1625     /* Create the submatrices */
1626     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1627     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1628 
1629     PetscCall(ISGetBlockSize(isrow[0], &i));
1630     PetscCall(ISGetBlockSize(iscol[0], &j));
1631     PetscCall(MatSetBlockSizes(submat, i, j));
1632     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1633     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1634 
1635     /* create struct Mat_SubSppt and attached it to submat */
1636     PetscCall(PetscNew(&smatis1));
1637     subc            = (Mat_SeqAIJ *)submat->data;
1638     subc->submatis1 = smatis1;
1639 
1640     smatis1->id          = 0;
1641     smatis1->nrqs        = nrqs;
1642     smatis1->nrqr        = nrqr;
1643     smatis1->rbuf1       = rbuf1;
1644     smatis1->rbuf2       = rbuf2;
1645     smatis1->rbuf3       = rbuf3;
1646     smatis1->sbuf2       = sbuf2;
1647     smatis1->req_source2 = req_source2;
1648 
1649     smatis1->sbuf1 = sbuf1;
1650     smatis1->ptr   = ptr;
1651     smatis1->tmp   = tmp;
1652     smatis1->ctr   = ctr;
1653 
1654     smatis1->pa          = pa;
1655     smatis1->req_size    = req_size;
1656     smatis1->req_source1 = req_source1;
1657 
1658     smatis1->allcolumns = allcolumns;
1659     smatis1->singleis   = PETSC_TRUE;
1660     smatis1->row2proc   = row2proc;
1661     smatis1->rmap       = rmap;
1662     smatis1->cmap       = cmap;
1663 #if defined(PETSC_USE_CTABLE)
1664     smatis1->rmap_loc = rmap_loc;
1665     smatis1->cmap_loc = cmap_loc;
1666 #endif
1667 
1668     smatis1->destroy     = submat->ops->destroy;
1669     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1670     submat->factortype   = C->factortype;
1671 
1672     /* compute rmax */
1673     rmax = 0;
1674     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1675 
1676   } else { /* scall == MAT_REUSE_MATRIX */
1677     submat = submats[0];
1678     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1679 
1680     subc        = (Mat_SeqAIJ *)submat->data;
1681     rmax        = subc->rmax;
1682     smatis1     = subc->submatis1;
1683     nrqs        = smatis1->nrqs;
1684     nrqr        = smatis1->nrqr;
1685     rbuf1       = smatis1->rbuf1;
1686     rbuf2       = smatis1->rbuf2;
1687     rbuf3       = smatis1->rbuf3;
1688     req_source2 = smatis1->req_source2;
1689 
1690     sbuf1 = smatis1->sbuf1;
1691     sbuf2 = smatis1->sbuf2;
1692     ptr   = smatis1->ptr;
1693     tmp   = smatis1->tmp;
1694     ctr   = smatis1->ctr;
1695 
1696     pa          = smatis1->pa;
1697     req_size    = smatis1->req_size;
1698     req_source1 = smatis1->req_source1;
1699 
1700     allcolumns = smatis1->allcolumns;
1701     row2proc   = smatis1->row2proc;
1702     rmap       = smatis1->rmap;
1703     cmap       = smatis1->cmap;
1704 #if defined(PETSC_USE_CTABLE)
1705     rmap_loc = smatis1->rmap_loc;
1706     cmap_loc = smatis1->cmap_loc;
1707 #endif
1708   }
1709 
1710   /* Post recv matrix values */
1711   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1712   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1713   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1714   for (i = 0; i < nrqs; ++i) {
1715     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1716     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1717   }
1718 
1719   /* Allocate sending buffers for a->a, and send them off */
1720   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1721   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1722   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1723   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1724 
1725   for (i = 0; i < nrqr; i++) {
1726     rbuf1_i   = rbuf1[i];
1727     sbuf_aa_i = sbuf_aa[i];
1728     ct1       = 2 * rbuf1_i[0] + 1;
1729     ct2       = 0;
1730     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1731 
1732     kmax = rbuf1_i[2];
1733     for (k = 0; k < kmax; k++, ct1++) {
1734       row    = rbuf1_i[ct1] - rstart;
1735       nzA    = ai[row + 1] - ai[row];
1736       nzB    = bi[row + 1] - bi[row];
1737       ncols  = nzA + nzB;
1738       cworkB = bj + bi[row];
1739       vworkA = a_a + ai[row];
1740       vworkB = b_a + bi[row];
1741 
1742       /* load the column values for this row into vals*/
1743       vals = sbuf_aa_i + ct2;
1744 
1745       lwrite = 0;
1746       for (l = 0; l < nzB; l++) {
1747         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1748       }
1749       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1750       for (l = 0; l < nzB; l++) {
1751         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1752       }
1753 
1754       ct2 += ncols;
1755     }
1756     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1757   }
1758 
1759   /* Assemble submat */
1760   /* First assemble the local rows */
1761   for (j = 0; j < nrow; j++) {
1762     row  = irow[j];
1763     proc = row2proc[j];
1764     if (proc == rank) {
1765       Crow = row - rstart; /* local row index of C */
1766 #if defined(PETSC_USE_CTABLE)
1767       row = rmap_loc[Crow]; /* row index of submat */
1768 #else
1769       row = rmap[row];
1770 #endif
1771 
1772       if (allcolumns) {
1773         /* diagonal part A = c->A */
1774         ncols = ai[Crow + 1] - ai[Crow];
1775         cols  = aj + ai[Crow];
1776         vals  = a_a + ai[Crow];
1777         i     = 0;
1778         for (k = 0; k < ncols; k++) {
1779           subcols[i]   = cols[k] + cstart;
1780           subvals[i++] = vals[k];
1781         }
1782 
1783         /* off-diagonal part B = c->B */
1784         ncols = bi[Crow + 1] - bi[Crow];
1785         cols  = bj + bi[Crow];
1786         vals  = b_a + bi[Crow];
1787         for (k = 0; k < ncols; k++) {
1788           subcols[i]   = bmap[cols[k]];
1789           subvals[i++] = vals[k];
1790         }
1791 
1792         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1793 
1794       } else { /* !allcolumns */
1795 #if defined(PETSC_USE_CTABLE)
1796         /* diagonal part A = c->A */
1797         ncols = ai[Crow + 1] - ai[Crow];
1798         cols  = aj + ai[Crow];
1799         vals  = a_a + ai[Crow];
1800         i     = 0;
1801         for (k = 0; k < ncols; k++) {
1802           tcol = cmap_loc[cols[k]];
1803           if (tcol) {
1804             subcols[i]   = --tcol;
1805             subvals[i++] = vals[k];
1806           }
1807         }
1808 
1809         /* off-diagonal part B = c->B */
1810         ncols = bi[Crow + 1] - bi[Crow];
1811         cols  = bj + bi[Crow];
1812         vals  = b_a + bi[Crow];
1813         for (k = 0; k < ncols; k++) {
1814           PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1815           if (tcol) {
1816             subcols[i]   = --tcol;
1817             subvals[i++] = vals[k];
1818           }
1819         }
1820 #else
1821         /* diagonal part A = c->A */
1822         ncols = ai[Crow + 1] - ai[Crow];
1823         cols  = aj + ai[Crow];
1824         vals  = a_a + ai[Crow];
1825         i     = 0;
1826         for (k = 0; k < ncols; k++) {
1827           tcol = cmap[cols[k] + cstart];
1828           if (tcol) {
1829             subcols[i]   = --tcol;
1830             subvals[i++] = vals[k];
1831           }
1832         }
1833 
1834         /* off-diagonal part B = c->B */
1835         ncols = bi[Crow + 1] - bi[Crow];
1836         cols  = bj + bi[Crow];
1837         vals  = b_a + bi[Crow];
1838         for (k = 0; k < ncols; k++) {
1839           tcol = cmap[bmap[cols[k]]];
1840           if (tcol) {
1841             subcols[i]   = --tcol;
1842             subvals[i++] = vals[k];
1843           }
1844         }
1845 #endif
1846         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1847       }
1848     }
1849   }
1850 
1851   /* Now assemble the off-proc rows */
1852   for (i = 0; i < nrqs; i++) { /* for each requested message */
1853     /* recv values from other processes */
1854     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1855     proc    = pa[idex];
1856     sbuf1_i = sbuf1[proc];
1857     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1858     ct1     = 2 + 1;
1859     ct2     = 0;           /* count of received C->j */
1860     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1861     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1862     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1863     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1864 
1865     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1866     max1 = sbuf1_i[2];                  /* num of rows */
1867     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1868       row = sbuf1_i[ct1];               /* row index of submat */
1869       if (!allcolumns) {
1870         idex = 0;
1871         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1872           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1873           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1874 #if defined(PETSC_USE_CTABLE)
1875             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1876               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1877             } else {
1878               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1879             }
1880 #else
1881             tcol = cmap[rbuf3_i[ct2]];
1882 #endif
1883             if (tcol) {
1884               subcols[idex]   = --tcol; /* may not be sorted */
1885               subvals[idex++] = rbuf4_i[ct2];
1886 
1887               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1888                For reuse, we replace received C->j with index that should be inserted to submat */
1889               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1890             }
1891           }
1892           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1893         } else { /* scall == MAT_REUSE_MATRIX */
1894           submat = submats[0];
1895           subc   = (Mat_SeqAIJ *)submat->data;
1896 
1897           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1898           for (l = 0; l < nnz; l++) {
1899             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1900             subvals[idex++] = rbuf4_i[ct2];
1901           }
1902 
1903           bj = subc->j + subc->i[row]; /* sorted column indices */
1904           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1905         }
1906       } else {              /* allcolumns */
1907         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1908         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1909         ct2 += nnz;
1910       }
1911     }
1912   }
1913 
1914   /* sending a->a are done */
1915   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1916   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1917 
1918   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1919   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1920   submats[0] = submat;
1921 
1922   /* Restore the indices */
1923   PetscCall(ISRestoreIndices(isrow[0], &irow));
1924   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1925 
1926   /* Destroy allocated memory */
1927   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1928   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1929   if (sbuf_aa) {
1930     PetscCall(PetscFree(sbuf_aa[0]));
1931     PetscCall(PetscFree(sbuf_aa));
1932   }
1933 
1934   if (scall == MAT_INITIAL_MATRIX) {
1935     PetscCall(PetscFree(lens));
1936     if (sbuf_aj) {
1937       PetscCall(PetscFree(sbuf_aj[0]));
1938       PetscCall(PetscFree(sbuf_aj));
1939     }
1940   }
1941   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1942   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1943   PetscFunctionReturn(0);
1944 }
1945 
1946 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1947 {
1948   PetscInt  ncol;
1949   PetscBool colflag, allcolumns = PETSC_FALSE;
1950 
1951   PetscFunctionBegin;
1952   /* Allocate memory to hold all the submatrices */
1953   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1954 
1955   /* Check for special case: each processor gets entire matrix columns */
1956   PetscCall(ISIdentity(iscol[0], &colflag));
1957   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1958   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1959 
1960   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1961   PetscFunctionReturn(0);
1962 }
1963 
/*
   Driver for extracting ismax sequential submatrices from the parallel AIJ
   matrix C.  Handles three paths: (1) every rank has a single IS (delegates
   to the SingleIS routine), (2) every rank wants the entire matrix
   (delegates to MatCreateSubMatrix_MPIAIJ_All), and (3) the general case,
   which is processed in collective stages of at most nmax submatrices each.

   Collective on C; all ranks must call with the same scall and loop through
   the same global number of stages.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices per stage, chosen so a stage touches roughly
     20M PetscInts worth of column data; at least 1 per stage */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* A single identity row IS and identity column IS spanning all of a
       square C means this rank wants the entire matrix */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* allow the user to opt out of the fast whole-matrix path */
        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single Allreduce for both quantities: negating the boolean turns
       MPI_MAX into a logical AND across ranks (any rank with
       wantallmatrix==PETSC_FALSE contributes 0, which dominates -1), while
       MAX of the stage counts gives the global nstages */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the Mat_SubSppt context saved during the MAT_INITIAL_MATRIX
       call; no collective communication is needed on reuse */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  /* extra nstages slots hold dummy matrices for ranks that run out of local
     ISs before the global stage loop ends */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of real submatrices this rank extracts in this stage;
       0 means this rank participates with a dummy matrix only */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2060 
2061 /* -------------------------------------------------------------------------*/
2062 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2063 {
2064   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2065   Mat              A = c->A;
2066   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2067   const PetscInt **icol, **irow;
2068   PetscInt        *nrow, *ncol, start;
2069   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2070   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2071   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2072   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2073   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2074 #if defined(PETSC_USE_CTABLE)
2075   PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2076 #else
2077   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2078 #endif
2079   const PetscInt *irow_i;
2080   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2081   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2082   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2083   MPI_Comm        comm;
2084   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2085   PetscMPIInt    *onodes1, *olengths1, end;
2086   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2087   Mat_SubSppt    *smat_i;
2088   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2089   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2090 
2091   PetscFunctionBegin;
2092   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2093   size = c->size;
2094   rank = c->rank;
2095 
2096   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2097   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2098 
2099   for (i = 0; i < ismax; i++) {
2100     PetscCall(ISSorted(iscol[i], &issorted[i]));
2101     if (!issorted[i]) iscsorted = issorted[i];
2102 
2103     PetscCall(ISSorted(isrow[i], &issorted[i]));
2104 
2105     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2106     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2107 
2108     /* Check for special case: allcolumn */
2109     PetscCall(ISIdentity(iscol[i], &colflag));
2110     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2111     if (colflag && ncol[i] == C->cmap->N) {
2112       allcolumns[i] = PETSC_TRUE;
2113       icol[i]       = NULL;
2114     } else {
2115       allcolumns[i] = PETSC_FALSE;
2116       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2117     }
2118   }
2119 
2120   if (scall == MAT_REUSE_MATRIX) {
2121     /* Assumes new rows are same length as the old rows */
2122     for (i = 0; i < ismax; i++) {
2123       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2124       subc = (Mat_SeqAIJ *)submats[i]->data;
2125       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2126 
2127       /* Initial matrix as if empty */
2128       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2129 
2130       smat_i = subc->submatis1;
2131 
2132       nrqs        = smat_i->nrqs;
2133       nrqr        = smat_i->nrqr;
2134       rbuf1       = smat_i->rbuf1;
2135       rbuf2       = smat_i->rbuf2;
2136       rbuf3       = smat_i->rbuf3;
2137       req_source2 = smat_i->req_source2;
2138 
2139       sbuf1 = smat_i->sbuf1;
2140       sbuf2 = smat_i->sbuf2;
2141       ptr   = smat_i->ptr;
2142       tmp   = smat_i->tmp;
2143       ctr   = smat_i->ctr;
2144 
2145       pa          = smat_i->pa;
2146       req_size    = smat_i->req_size;
2147       req_source1 = smat_i->req_source1;
2148 
2149       allcolumns[i] = smat_i->allcolumns;
2150       row2proc[i]   = smat_i->row2proc;
2151       rmap[i]       = smat_i->rmap;
2152       cmap[i]       = smat_i->cmap;
2153     }
2154 
2155     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2156       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2157       smat_i = (Mat_SubSppt *)submats[0]->data;
2158 
2159       nrqs        = smat_i->nrqs;
2160       nrqr        = smat_i->nrqr;
2161       rbuf1       = smat_i->rbuf1;
2162       rbuf2       = smat_i->rbuf2;
2163       rbuf3       = smat_i->rbuf3;
2164       req_source2 = smat_i->req_source2;
2165 
2166       sbuf1 = smat_i->sbuf1;
2167       sbuf2 = smat_i->sbuf2;
2168       ptr   = smat_i->ptr;
2169       tmp   = smat_i->tmp;
2170       ctr   = smat_i->ctr;
2171 
2172       pa          = smat_i->pa;
2173       req_size    = smat_i->req_size;
2174       req_source1 = smat_i->req_source1;
2175 
2176       allcolumns[0] = PETSC_FALSE;
2177     }
2178   } else { /* scall == MAT_INITIAL_MATRIX */
2179     /* Get some new tags to keep the communication clean */
2180     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2181     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2182 
2183     /* evaluate communication - mesg to who, length of mesg, and buffer space
2184      required. Based on this, buffers are allocated, and data copied into them*/
2185     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2186 
2187     for (i = 0; i < ismax; i++) {
2188       jmax   = nrow[i];
2189       irow_i = irow[i];
2190 
2191       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2192       row2proc[i] = row2proc_i;
2193 
2194       if (issorted[i]) proc = 0;
2195       for (j = 0; j < jmax; j++) {
2196         if (!issorted[i]) proc = 0;
2197         row = irow_i[j];
2198         while (row >= C->rmap->range[proc + 1]) proc++;
2199         w4[proc]++;
2200         row2proc_i[j] = proc; /* map row index to proc */
2201       }
2202       for (j = 0; j < size; j++) {
2203         if (w4[j]) {
2204           w1[j] += w4[j];
2205           w3[j]++;
2206           w4[j] = 0;
2207         }
2208       }
2209     }
2210 
2211     nrqs     = 0; /* no of outgoing messages */
2212     msz      = 0; /* total mesg length (for all procs) */
2213     w1[rank] = 0; /* no mesg sent to self */
2214     w3[rank] = 0;
2215     for (i = 0; i < size; i++) {
2216       if (w1[i]) {
2217         w2[i] = 1;
2218         nrqs++;
2219       } /* there exists a message to proc i */
2220     }
2221     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2222     for (i = 0, j = 0; i < size; i++) {
2223       if (w1[i]) {
2224         pa[j] = i;
2225         j++;
2226       }
2227     }
2228 
2229     /* Each message would have a header = 1 + 2*(no of IS) + data */
2230     for (i = 0; i < nrqs; i++) {
2231       j = pa[i];
2232       w1[j] += w2[j] + 2 * w3[j];
2233       msz += w1[j];
2234     }
2235     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2236 
2237     /* Determine the number of messages to expect, their lengths, from from-ids */
2238     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2239     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2240 
2241     /* Now post the Irecvs corresponding to these messages */
2242     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2243     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2244 
2245     /* Allocate Memory for outgoing messages */
2246     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2247     PetscCall(PetscArrayzero(sbuf1, size));
2248     PetscCall(PetscArrayzero(ptr, size));
2249 
2250     {
2251       PetscInt *iptr = tmp;
2252       k              = 0;
2253       for (i = 0; i < nrqs; i++) {
2254         j = pa[i];
2255         iptr += k;
2256         sbuf1[j] = iptr;
2257         k        = w1[j];
2258       }
2259     }
2260 
2261     /* Form the outgoing messages. Initialize the header space */
2262     for (i = 0; i < nrqs; i++) {
2263       j           = pa[i];
2264       sbuf1[j][0] = 0;
2265       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2266       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2267     }
2268 
2269     /* Parse the isrow and copy data into outbuf */
2270     for (i = 0; i < ismax; i++) {
2271       row2proc_i = row2proc[i];
2272       PetscCall(PetscArrayzero(ctr, size));
2273       irow_i = irow[i];
2274       jmax   = nrow[i];
2275       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2276         proc = row2proc_i[j];
2277         if (proc != rank) { /* copy to the outgoing buf*/
2278           ctr[proc]++;
2279           *ptr[proc] = irow_i[j];
2280           ptr[proc]++;
2281         }
2282       }
2283       /* Update the headers for the current IS */
2284       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2285         if ((ctr_j = ctr[j])) {
2286           sbuf1_j            = sbuf1[j];
2287           k                  = ++sbuf1_j[0];
2288           sbuf1_j[2 * k]     = ctr_j;
2289           sbuf1_j[2 * k - 1] = i;
2290         }
2291       }
2292     }
2293 
2294     /*  Now  post the sends */
2295     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2296     for (i = 0; i < nrqs; ++i) {
2297       j = pa[i];
2298       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2299     }
2300 
2301     /* Post Receives to capture the buffer size */
2302     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2303     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2304     if (nrqs) rbuf2[0] = tmp + msz;
2305     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2306     for (i = 0; i < nrqs; ++i) {
2307       j = pa[i];
2308       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2309     }
2310 
2311     /* Send to other procs the buf size they should allocate */
2312     /* Receive messages*/
2313     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2314     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2315     {
2316       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2317       PetscInt *sbuf2_i;
2318 
2319       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2320       for (i = 0; i < nrqr; ++i) {
2321         req_size[i] = 0;
2322         rbuf1_i     = rbuf1[i];
2323         start       = 2 * rbuf1_i[0] + 1;
2324         end         = olengths1[i];
2325         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2326         sbuf2_i = sbuf2[i];
2327         for (j = start; j < end; j++) {
2328           id         = rbuf1_i[j] - rstart;
2329           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2330           sbuf2_i[j] = ncols;
2331           req_size[i] += ncols;
2332         }
2333         req_source1[i] = onodes1[i];
2334         /* form the header */
2335         sbuf2_i[0] = req_size[i];
2336         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2337 
2338         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2339       }
2340     }
2341 
2342     PetscCall(PetscFree(onodes1));
2343     PetscCall(PetscFree(olengths1));
2344     PetscCall(PetscFree(r_waits1));
2345     PetscCall(PetscFree4(w1, w2, w3, w4));
2346 
2347     /* Receive messages*/
2348     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2349     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2350     for (i = 0; i < nrqs; ++i) {
2351       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2352       req_source2[i] = pa[i];
2353       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2354     }
2355     PetscCall(PetscFree(r_waits2));
2356 
2357     /* Wait on sends1 and sends2 */
2358     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2359     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2360     PetscCall(PetscFree(s_waits1));
2361     PetscCall(PetscFree(s_waits2));
2362 
2363     /* Now allocate sending buffers for a->j, and send them off */
2364     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2365     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2366     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2367     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2368 
2369     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2370     {
2371       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2372       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2373       PetscInt  cend = C->cmap->rend;
2374       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2375 
2376       for (i = 0; i < nrqr; i++) {
2377         rbuf1_i   = rbuf1[i];
2378         sbuf_aj_i = sbuf_aj[i];
2379         ct1       = 2 * rbuf1_i[0] + 1;
2380         ct2       = 0;
2381         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2382           kmax = rbuf1[i][2 * j];
2383           for (k = 0; k < kmax; k++, ct1++) {
2384             row    = rbuf1_i[ct1] - rstart;
2385             nzA    = a_i[row + 1] - a_i[row];
2386             nzB    = b_i[row + 1] - b_i[row];
2387             ncols  = nzA + nzB;
2388             cworkA = a_j + a_i[row];
2389             cworkB = b_j + b_i[row];
2390 
2391             /* load the column indices for this row into cols */
2392             cols = sbuf_aj_i + ct2;
2393 
2394             lwrite = 0;
2395             for (l = 0; l < nzB; l++) {
2396               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2397             }
2398             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2399             for (l = 0; l < nzB; l++) {
2400               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2401             }
2402 
2403             ct2 += ncols;
2404           }
2405         }
2406         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2407       }
2408     }
2409 
2410     /* create col map: global col of C -> local col of submatrices */
2411     {
2412       const PetscInt *icol_i;
2413 #if defined(PETSC_USE_CTABLE)
2414       for (i = 0; i < ismax; i++) {
2415         if (!allcolumns[i]) {
2416           PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]));
2417 
2418           jmax   = ncol[i];
2419           icol_i = icol[i];
2420           cmap_i = cmap[i];
2421           for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES));
2422         } else cmap[i] = NULL;
2423       }
2424 #else
2425       for (i = 0; i < ismax; i++) {
2426         if (!allcolumns[i]) {
2427           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2428           jmax   = ncol[i];
2429           icol_i = icol[i];
2430           cmap_i = cmap[i];
2431           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2432         } else cmap[i] = NULL;
2433       }
2434 #endif
2435     }
2436 
2437     /* Create lens which is required for MatCreate... */
2438     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2439     PetscCall(PetscMalloc1(ismax, &lens));
2440 
2441     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2442     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2443 
2444     /* Update lens from local data */
2445     for (i = 0; i < ismax; i++) {
2446       row2proc_i = row2proc[i];
2447       jmax       = nrow[i];
2448       if (!allcolumns[i]) cmap_i = cmap[i];
2449       irow_i = irow[i];
2450       lens_i = lens[i];
2451       for (j = 0; j < jmax; j++) {
2452         row  = irow_i[j];
2453         proc = row2proc_i[j];
2454         if (proc == rank) {
2455           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2456           if (!allcolumns[i]) {
2457             for (k = 0; k < ncols; k++) {
2458 #if defined(PETSC_USE_CTABLE)
2459               PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2460 #else
2461               tcol = cmap_i[cols[k]];
2462 #endif
2463               if (tcol) lens_i[j]++;
2464             }
2465           } else { /* allcolumns */
2466             lens_i[j] = ncols;
2467           }
2468           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2469         }
2470       }
2471     }
2472 
2473     /* Create row map: global row of C -> local row of submatrices */
2474 #if defined(PETSC_USE_CTABLE)
2475     for (i = 0; i < ismax; i++) {
2476       PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]));
2477       irow_i = irow[i];
2478       jmax   = nrow[i];
2479       for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES));
2480     }
2481 #else
2482     for (i = 0; i < ismax; i++) {
2483       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2484       rmap_i = rmap[i];
2485       irow_i = irow[i];
2486       jmax   = nrow[i];
2487       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2488     }
2489 #endif
2490 
2491     /* Update lens from offproc data */
2492     {
2493       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2494 
2495       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2496       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2497         sbuf1_i = sbuf1[pa[tmp2]];
2498         jmax    = sbuf1_i[0];
2499         ct1     = 2 * jmax + 1;
2500         ct2     = 0;
2501         rbuf2_i = rbuf2[tmp2];
2502         rbuf3_i = rbuf3[tmp2];
2503         for (j = 1; j <= jmax; j++) {
2504           is_no  = sbuf1_i[2 * j - 1];
2505           max1   = sbuf1_i[2 * j];
2506           lens_i = lens[is_no];
2507           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2508           rmap_i = rmap[is_no];
2509           for (k = 0; k < max1; k++, ct1++) {
2510 #if defined(PETSC_USE_CTABLE)
2511             PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row));
2512             row--;
2513             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2514 #else
2515             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2516 #endif
2517             max2 = rbuf2_i[ct1];
2518             for (l = 0; l < max2; l++, ct2++) {
2519               if (!allcolumns[is_no]) {
2520 #if defined(PETSC_USE_CTABLE)
2521                 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2522 #else
2523                 tcol = cmap_i[rbuf3_i[ct2]];
2524 #endif
2525                 if (tcol) lens_i[row]++;
2526               } else {         /* allcolumns */
2527                 lens_i[row]++; /* lens_i[row] += max2 ? */
2528               }
2529             }
2530           }
2531         }
2532       }
2533     }
2534     PetscCall(PetscFree(r_waits3));
2535     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2536     PetscCall(PetscFree(s_waits3));
2537 
2538     /* Create the submatrices */
2539     for (i = 0; i < ismax; i++) {
2540       PetscInt rbs, cbs;
2541 
2542       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2543       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2544 
2545       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2546       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2547 
2548       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2549       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2550       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2551 
      /* create struct Mat_SubSppt and attach it to submat */
2553       PetscCall(PetscNew(&smat_i));
2554       subc            = (Mat_SeqAIJ *)submats[i]->data;
2555       subc->submatis1 = smat_i;
2556 
2557       smat_i->destroy          = submats[i]->ops->destroy;
2558       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2559       submats[i]->factortype   = C->factortype;
2560 
2561       smat_i->id          = i;
2562       smat_i->nrqs        = nrqs;
2563       smat_i->nrqr        = nrqr;
2564       smat_i->rbuf1       = rbuf1;
2565       smat_i->rbuf2       = rbuf2;
2566       smat_i->rbuf3       = rbuf3;
2567       smat_i->sbuf2       = sbuf2;
2568       smat_i->req_source2 = req_source2;
2569 
2570       smat_i->sbuf1 = sbuf1;
2571       smat_i->ptr   = ptr;
2572       smat_i->tmp   = tmp;
2573       smat_i->ctr   = ctr;
2574 
2575       smat_i->pa          = pa;
2576       smat_i->req_size    = req_size;
2577       smat_i->req_source1 = req_source1;
2578 
2579       smat_i->allcolumns = allcolumns[i];
2580       smat_i->singleis   = PETSC_FALSE;
2581       smat_i->row2proc   = row2proc[i];
2582       smat_i->rmap       = rmap[i];
2583       smat_i->cmap       = cmap[i];
2584     }
2585 
2586     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2587       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2588       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2589       PetscCall(MatSetType(submats[0], MATDUMMY));
2590 
      /* create struct Mat_SubSppt and attach it to submat */
2592       PetscCall(PetscNew(&smat_i));
2593       submats[0]->data = (void *)smat_i;
2594 
2595       smat_i->destroy          = submats[0]->ops->destroy;
2596       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2597       submats[0]->factortype   = C->factortype;
2598 
2599       smat_i->id          = 0;
2600       smat_i->nrqs        = nrqs;
2601       smat_i->nrqr        = nrqr;
2602       smat_i->rbuf1       = rbuf1;
2603       smat_i->rbuf2       = rbuf2;
2604       smat_i->rbuf3       = rbuf3;
2605       smat_i->sbuf2       = sbuf2;
2606       smat_i->req_source2 = req_source2;
2607 
2608       smat_i->sbuf1 = sbuf1;
2609       smat_i->ptr   = ptr;
2610       smat_i->tmp   = tmp;
2611       smat_i->ctr   = ctr;
2612 
2613       smat_i->pa          = pa;
2614       smat_i->req_size    = req_size;
2615       smat_i->req_source1 = req_source1;
2616 
2617       smat_i->allcolumns = PETSC_FALSE;
2618       smat_i->singleis   = PETSC_FALSE;
2619       smat_i->row2proc   = NULL;
2620       smat_i->rmap       = NULL;
2621       smat_i->cmap       = NULL;
2622     }
2623 
2624     if (ismax) PetscCall(PetscFree(lens[0]));
2625     PetscCall(PetscFree(lens));
2626     if (sbuf_aj) {
2627       PetscCall(PetscFree(sbuf_aj[0]));
2628       PetscCall(PetscFree(sbuf_aj));
2629     }
2630 
2631   } /* endof scall == MAT_INITIAL_MATRIX */
2632 
2633   /* Post recv matrix values */
2634   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2635   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2636   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2637   for (i = 0; i < nrqs; ++i) {
2638     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2639     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2640   }
2641 
2642   /* Allocate sending buffers for a->a, and send them off */
2643   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2644   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2645   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2646   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2647 
2648   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2649   {
2650     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2651     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2652     PetscInt     cend = C->cmap->rend;
2653     PetscInt    *b_j  = b->j;
2654     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2655 
2656     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2657     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2658     for (i = 0; i < nrqr; i++) {
2659       rbuf1_i   = rbuf1[i];
2660       sbuf_aa_i = sbuf_aa[i];
2661       ct1       = 2 * rbuf1_i[0] + 1;
2662       ct2       = 0;
2663       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2664         kmax = rbuf1_i[2 * j];
2665         for (k = 0; k < kmax; k++, ct1++) {
2666           row    = rbuf1_i[ct1] - rstart;
2667           nzA    = a_i[row + 1] - a_i[row];
2668           nzB    = b_i[row + 1] - b_i[row];
2669           ncols  = nzA + nzB;
2670           cworkB = b_j + b_i[row];
2671           vworkA = a_a + a_i[row];
2672           vworkB = b_a + b_i[row];
2673 
2674           /* load the column values for this row into vals*/
2675           vals = sbuf_aa_i + ct2;
2676 
2677           lwrite = 0;
2678           for (l = 0; l < nzB; l++) {
2679             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2680           }
2681           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2682           for (l = 0; l < nzB; l++) {
2683             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2684           }
2685 
2686           ct2 += ncols;
2687         }
2688       }
2689       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2690     }
2691     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2692     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2693   }
2694 
2695   /* Assemble the matrices */
2696   /* First assemble the local rows */
2697   for (i = 0; i < ismax; i++) {
2698     row2proc_i = row2proc[i];
2699     subc       = (Mat_SeqAIJ *)submats[i]->data;
2700     imat_ilen  = subc->ilen;
2701     imat_j     = subc->j;
2702     imat_i     = subc->i;
2703     imat_a     = subc->a;
2704 
2705     if (!allcolumns[i]) cmap_i = cmap[i];
2706     rmap_i = rmap[i];
2707     irow_i = irow[i];
2708     jmax   = nrow[i];
2709     for (j = 0; j < jmax; j++) {
2710       row  = irow_i[j];
2711       proc = row2proc_i[j];
2712       if (proc == rank) {
2713         old_row = row;
2714 #if defined(PETSC_USE_CTABLE)
2715         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2716         row--;
2717 #else
2718         row = rmap_i[row];
2719 #endif
2720         ilen_row = imat_ilen[row];
2721         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2722         mat_i = imat_i[row];
2723         mat_a = imat_a + mat_i;
2724         mat_j = imat_j + mat_i;
2725         if (!allcolumns[i]) {
2726           for (k = 0; k < ncols; k++) {
2727 #if defined(PETSC_USE_CTABLE)
2728             PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2729 #else
2730             tcol = cmap_i[cols[k]];
2731 #endif
2732             if (tcol) {
2733               *mat_j++ = tcol - 1;
2734               *mat_a++ = vals[k];
2735               ilen_row++;
2736             }
2737           }
2738         } else { /* allcolumns */
2739           for (k = 0; k < ncols; k++) {
2740             *mat_j++ = cols[k]; /* global col index! */
2741             *mat_a++ = vals[k];
2742             ilen_row++;
2743           }
2744         }
2745         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2746 
2747         imat_ilen[row] = ilen_row;
2748       }
2749     }
2750   }
2751 
2752   /* Now assemble the off proc rows */
2753   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2754   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2755     sbuf1_i = sbuf1[pa[tmp2]];
2756     jmax    = sbuf1_i[0];
2757     ct1     = 2 * jmax + 1;
2758     ct2     = 0;
2759     rbuf2_i = rbuf2[tmp2];
2760     rbuf3_i = rbuf3[tmp2];
2761     rbuf4_i = rbuf4[tmp2];
2762     for (j = 1; j <= jmax; j++) {
2763       is_no  = sbuf1_i[2 * j - 1];
2764       rmap_i = rmap[is_no];
2765       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2766       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2767       imat_ilen = subc->ilen;
2768       imat_j    = subc->j;
2769       imat_i    = subc->i;
2770       imat_a    = subc->a;
2771       max1      = sbuf1_i[2 * j];
2772       for (k = 0; k < max1; k++, ct1++) {
2773         row = sbuf1_i[ct1];
2774 #if defined(PETSC_USE_CTABLE)
2775         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2776         row--;
2777 #else
2778         row = rmap_i[row];
2779 #endif
2780         ilen  = imat_ilen[row];
2781         mat_i = imat_i[row];
2782         mat_a = imat_a + mat_i;
2783         mat_j = imat_j + mat_i;
2784         max2  = rbuf2_i[ct1];
2785         if (!allcolumns[is_no]) {
2786           for (l = 0; l < max2; l++, ct2++) {
2787 #if defined(PETSC_USE_CTABLE)
2788             PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2789 #else
2790             tcol = cmap_i[rbuf3_i[ct2]];
2791 #endif
2792             if (tcol) {
2793               *mat_j++ = tcol - 1;
2794               *mat_a++ = rbuf4_i[ct2];
2795               ilen++;
2796             }
2797           }
2798         } else { /* allcolumns */
2799           for (l = 0; l < max2; l++, ct2++) {
2800             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2801             *mat_a++ = rbuf4_i[ct2];
2802             ilen++;
2803           }
2804         }
2805         imat_ilen[row] = ilen;
2806       }
2807     }
2808   }
2809 
2810   if (!iscsorted) { /* sort column indices of the rows */
2811     for (i = 0; i < ismax; i++) {
2812       subc      = (Mat_SeqAIJ *)submats[i]->data;
2813       imat_j    = subc->j;
2814       imat_i    = subc->i;
2815       imat_a    = subc->a;
2816       imat_ilen = subc->ilen;
2817 
2818       if (allcolumns[i]) continue;
2819       jmax = nrow[i];
2820       for (j = 0; j < jmax; j++) {
2821         mat_i = imat_i[j];
2822         mat_a = imat_a + mat_i;
2823         mat_j = imat_j + mat_i;
2824         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2825       }
2826     }
2827   }
2828 
2829   PetscCall(PetscFree(r_waits4));
2830   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2831   PetscCall(PetscFree(s_waits4));
2832 
2833   /* Restore the indices */
2834   for (i = 0; i < ismax; i++) {
2835     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2836     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2837   }
2838 
2839   for (i = 0; i < ismax; i++) {
2840     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2841     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2842   }
2843 
2844   /* Destroy allocated memory */
2845   if (sbuf_aa) {
2846     PetscCall(PetscFree(sbuf_aa[0]));
2847     PetscCall(PetscFree(sbuf_aa));
2848   }
2849   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2850 
2851   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2852   PetscCall(PetscFree(rbuf4));
2853 
2854   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2855   PetscFunctionReturn(0);
2856 }
2857 
2858 /*
2859  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2860  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0, N-n).
2862  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2863  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2864  state, and needs to be "assembled" later by compressing B's column space.
2865 
2866  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2867  Following this call, C->A & C->B have been created, even if empty.
2868  */
2869 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2870 {
2871   /* If making this function public, change the error returned in this function away from _PLIB. */
2872   Mat_MPIAIJ     *aij;
2873   Mat_SeqAIJ     *Baij;
2874   PetscBool       seqaij, Bdisassembled;
2875   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2876   PetscScalar     v;
2877   const PetscInt *rowindices, *colindices;
2878 
2879   PetscFunctionBegin;
2880   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2881   if (A) {
2882     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2883     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2884     if (rowemb) {
2885       PetscCall(ISGetLocalSize(rowemb, &m));
2886       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2887     } else {
2888       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2889     }
2890     if (dcolemb) {
2891       PetscCall(ISGetLocalSize(dcolemb, &n));
2892       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2893     } else {
2894       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2895     }
2896   }
2897   if (B) {
2898     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2899     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2900     if (rowemb) {
2901       PetscCall(ISGetLocalSize(rowemb, &m));
2902       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2903     } else {
2904       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2905     }
2906     if (ocolemb) {
2907       PetscCall(ISGetLocalSize(ocolemb, &n));
2908       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2909     } else {
2910       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2911     }
2912   }
2913 
2914   aij = (Mat_MPIAIJ *)C->data;
2915   if (!aij->A) {
2916     /* Mimic parts of MatMPIAIJSetPreallocation() */
2917     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2918     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2919     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2920     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2921   }
2922   if (A) {
2923     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2924   } else {
2925     PetscCall(MatSetUp(aij->A));
2926   }
2927   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2928     /*
2929       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2930       need to "disassemble" B -- convert it to using C's global indices.
2931       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2932 
2933       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2934       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2935 
2936       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2937       At least avoid calling MatSetValues() and the implied searches?
2938     */
2939 
2940     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2941 #if defined(PETSC_USE_CTABLE)
2942       PetscCall(PetscTableDestroy(&aij->colmap));
2943 #else
2944       PetscCall(PetscFree(aij->colmap));
2945       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2946 #endif
2947       ngcol = 0;
2948       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2949       if (aij->garray) { PetscCall(PetscFree(aij->garray)); }
2950       PetscCall(VecDestroy(&aij->lvec));
2951       PetscCall(VecScatterDestroy(&aij->Mvctx));
2952     }
2953     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2954     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2955   }
2956   Bdisassembled = PETSC_FALSE;
2957   if (!aij->B) {
2958     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2959     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2960     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2961     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2962     Bdisassembled = PETSC_TRUE;
2963   }
2964   if (B) {
2965     Baij = (Mat_SeqAIJ *)B->data;
2966     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2967       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2968       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2969       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2970       PetscCall(PetscFree(nz));
2971     }
2972 
2973     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2974     shift      = rend - rstart;
2975     count      = 0;
2976     rowindices = NULL;
2977     colindices = NULL;
2978     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2979     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2980     for (i = 0; i < B->rmap->n; i++) {
2981       PetscInt row;
2982       row = i;
2983       if (rowindices) row = rowindices[i];
2984       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2985         col = Baij->j[count];
2986         if (colindices) col = colindices[col];
2987         if (Bdisassembled && col >= rstart) col += shift;
2988         v = Baij->a[count];
2989         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2990         ++count;
2991       }
2992     }
2993     /* No assembly for aij->B is necessary. */
2994     /* FIXME: set aij->B's nonzerostate correctly. */
2995   } else {
2996     PetscCall(MatSetUp(aij->B));
2997   }
2998   C->preallocated  = PETSC_TRUE;
2999   C->was_assembled = PETSC_FALSE;
3000   C->assembled     = PETSC_FALSE;
3001   /*
3002       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3003       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3004    */
3005   PetscFunctionReturn(0);
3006 }
3007 
3008 /*
3009   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3010  */
3011 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3012 {
3013   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3014 
3015   PetscFunctionBegin;
3016   PetscValidPointer(A, 2);
3017   PetscValidPointer(B, 3);
3018   /* FIXME: make sure C is assembled */
3019   *A = aij->A;
3020   *B = aij->B;
3021   /* Note that we don't incref *A and *B, so be careful! */
3022   PetscFunctionReturn(0);
3023 }
3024 
3025 /*
3026   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3027   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3028 */
3029 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3030 {
3031   PetscMPIInt size, flag;
3032   PetscInt    i, ii, cismax, ispar;
3033   Mat        *A, *B;
3034   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3035 
3036   PetscFunctionBegin;
3037   if (!ismax) PetscFunctionReturn(0);
3038 
3039   for (i = 0, cismax = 0; i < ismax; ++i) {
3040     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3041     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3042     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3043     if (size > 1) ++cismax;
3044   }
3045 
3046   /*
3047      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3048      ispar counts the number of parallel ISs across C's comm.
3049   */
3050   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3051   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3052     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3053     PetscFunctionReturn(0);
3054   }
3055 
3056   /* if (ispar) */
3057   /*
3058     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3059     These are used to extract the off-diag portion of the resulting parallel matrix.
3060     The row IS for the off-diag portion is the same as for the diag portion,
3061     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3062   */
3063   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3064   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3065   for (i = 0, ii = 0; i < ismax; ++i) {
3066     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3067     if (size > 1) {
3068       /*
3069          TODO: This is the part that's ***NOT SCALABLE***.
3070          To fix this we need to extract just the indices of C's nonzero columns
3071          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3072          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3073          to be done without serializing on the IS list, so, most likely, it is best
3074          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3075       */
3076       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3077       /* Now we have to
3078          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3079              were sorted on each rank, concatenated they might no longer be sorted;
3080          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3081              indices in the nondecreasing order to the original index positions.
3082          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3083       */
3084       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3085       PetscCall(ISSort(ciscol[ii]));
3086       ++ii;
3087     }
3088   }
3089   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3090   for (i = 0, ii = 0; i < ismax; ++i) {
3091     PetscInt        j, issize;
3092     const PetscInt *indices;
3093 
3094     /*
3095        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3096      */
3097     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3098     PetscCall(ISSort(isrow[i]));
3099     PetscCall(ISGetLocalSize(isrow[i], &issize));
3100     PetscCall(ISGetIndices(isrow[i], &indices));
3101     for (j = 1; j < issize; ++j) {
3102       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3103     }
3104     PetscCall(ISRestoreIndices(isrow[i], &indices));
3105     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3106     PetscCall(ISSort(iscol[i]));
3107     PetscCall(ISGetLocalSize(iscol[i], &issize));
3108     PetscCall(ISGetIndices(iscol[i], &indices));
3109     for (j = 1; j < issize; ++j) {
3110       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3111     }
3112     PetscCall(ISRestoreIndices(iscol[i], &indices));
3113     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3114     if (size > 1) {
3115       cisrow[ii] = isrow[i];
3116       ++ii;
3117     }
3118   }
3119   /*
3120     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3121     array of sequential matrices underlying the resulting parallel matrices.
3122     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3123     contain duplicates.
3124 
3125     There are as many diag matrices as there are original index sets. There are only as many parallel
3126     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3127 
3128     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3129     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3130       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3131       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3132       with A[i] and B[ii] extracted from the corresponding MPI submat.
3133     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3134       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3135       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3136       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3137       values into A[i] and B[ii] sitting inside the corresponding submat.
3138     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3139       A[i], B[ii], AA[i] or BB[ii] matrices.
3140   */
3141   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3142   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3143 
3144   /* Now obtain the sequential A and B submatrices separately. */
3145   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3146   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3147   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3148 
3149   /*
3150     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3151     matrices A & B have been extracted directly into the parallel matrices containing them, or
3152     simply into the sequential matrix identical with the corresponding A (if size == 1).
3153     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3154     to have the same sparsity pattern.
3155     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3156     must be constructed for C. This is done by setseqmat(s).
3157   */
3158   for (i = 0, ii = 0; i < ismax; ++i) {
3159     /*
3160        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3161        That way we can avoid sorting and computing permutations when reusing.
3162        To this end:
3163         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3164         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3165           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3166     */
3167     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3168 
3169     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3170     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3171     if (size > 1) {
3172       if (scall == MAT_INITIAL_MATRIX) {
3173         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3174         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3175         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3176         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3177         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3178       }
3179       /*
3180         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3181       */
3182       {
3183         Mat AA = A[i], BB = B[ii];
3184 
3185         if (AA || BB) {
3186           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3187           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3188           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3189         }
3190         PetscCall(MatDestroy(&AA));
3191       }
3192       PetscCall(ISDestroy(ciscol + ii));
3193       PetscCall(ISDestroy(ciscol_p + ii));
3194       ++ii;
3195     } else { /* if (size == 1) */
3196       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3197       if (isrow_p[i] || iscol_p[i]) {
3198         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3199         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3200         /* Otherwise A is extracted straight into (*submat)[i]. */
3201         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3202         PetscCall(MatDestroy(A + i));
3203       } else (*submat)[i] = A[i];
3204     }
3205     PetscCall(ISDestroy(&isrow_p[i]));
3206     PetscCall(ISDestroy(&iscol_p[i]));
3207   }
3208   PetscCall(PetscFree2(cisrow, ciscol));
3209   PetscCall(PetscFree2(isrow_p, iscol_p));
3210   PetscCall(PetscFree(ciscol_p));
3211   PetscCall(PetscFree(A));
3212   PetscCall(MatDestroySubMatrices(cismax, &B));
3213   PetscFunctionReturn(0);
3214 }
3215 
/*
   MatCreateSubMatricesMPI_MPIAIJ - MPIAIJ implementation of MatCreateSubMatricesMPI().

   Thin wrapper: forwards C, the ismax row/column index sets, the reuse flag, and the output
   array to MatCreateSubMatricesMPI_MPIXAIJ(), supplying the AIJ-specific callbacks:
     - MatCreateSubMatrices_MPIAIJ: sequential (per-rank) submatrix extraction,
     - MatGetSeqMats_MPIAIJ:        retrieval of the underlying Seq pieces,
     - MatSetSeqMat_SeqAIJ / MatSetSeqMats_MPIAIJ: embedding of the Seq pieces into the
       (possibly parallel) result matrices.
   All submatrix logic, including the sequential-vs-parallel IS dispatch, lives in the
   shared XAIJ driver; this routine only selects the format-specific hooks.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}
3222