xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision e874ec00d637a86419bb2cc912cf88b33e5547ef)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
22 {
23   PetscInt i;
24 
25   PetscFunctionBegin;
26   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
27   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
28   PetscFunctionReturn(0);
29 }
30 
31 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
32 {
33   PetscInt i;
34 
35   PetscFunctionBegin;
36   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
37   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
38   PetscFunctionReturn(0);
39 }
40 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one layer of overlap to each IS in is[]
  without allocating O(global matrix size) buffers: rows owned by other ranks are
  shipped out with PetscCommBuildTwoSided/PetscSF, processed remotely, and the
  resulting column sets are merged back in.

  Each is[i] is destroyed and replaced by its enlarged version.
  Collective on the communicator of mat.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    /* no rank requested anything remote: a purely local pass suffices */
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  /* remote was just malloc'd; ownership passes to the SF (PETSC_OWN_POINTER) */
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: reply sizes flow in the reverse direction; nto/toranks/tosizes are re-populated here */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: the local pass runs while replies are in flight */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
183 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from remote
  ranks into the local index sets and rebuild each IS.

  recvdata holds nrecvs integers formatted as repeated triples
  <is id, data_size, data_size row indices>.  Each is[i] is destroyed and recreated
  (on a duplicate of its original communicator) with the merged, sorted,
  duplicate-free index list.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one scratch slab of (max_lsize + nrecvs) entries per IS: enough for the existing
     indices plus, in the worst case, every received entry landing in that IS */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}
231 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - build the reply buffers for rows that
  remote ranks requested.

  fromrows holds, per source rank i, fromsizes[2*i] integers formatted as
  <is id, row> pairs.  For every requested locally-owned row the global column
  indices are gathered from the diagonal (amat) and off-diagonal (bmat) blocks.

  On output *sbrows is packed as repeated triples <is id, count, count columns>
  and *sbrowsizes holds per destination rank the pair <message size, offset>;
  both arrays come from a single PetscCalloc2 and are freed by the caller with
  PetscFree2(sbdata, sbsizes).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  /* rows_data[i] aliases a slab of rows_data_ptr; freed only through rows_data_ptr */
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count how many (unique) columns each <rank, is> pair produces */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, now actually packing sbdata */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs expected by the SF setup */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
353 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - purely local part of the overlap
  increase: for each locally owned row already present in an IS, add the global
  column indices of that row (from both the diagonal and off-diagonal blocks).
  Rows owned by other ranks are skipped here; they are handled by the
  send/receive path.  Each is[i] is destroyed and recreated sorted and
  duplicate-free on a duplicate of its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* duplicate the communicator before destroying the IS so the new IS lives on the same one */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
420 
421 /*
422   Sample message format:
423   If a processor A wants processor B to process some elements corresponding
424   to index sets is[1],is[5]
425   mesg [0] = 2   (no of index sets in the mesg)
426   -----------
427   mesg [1] = 1 => is[1]
428   mesg [2] = sizeof(is[1]);
429   -----------
430   mesg [3] = 5  => is[5]
431   mesg [4] = sizeof(is[5]);
432   -----------
433   mesg [5]
434   mesg [n]  data(is[1])
435   -----------
436   mesg[n+1]
437   mesg[m]  data(is[5])
438   -----------
439 
440   Notes:
441   nrqs - no of requests sent (or to be sent out)
442   nrqr - no of requests received (which have to be or which have been processed)
443 */
/*
  MatIncreaseOverlap_MPIAIJ_Once - classic (non-scalable) one-layer overlap increase.

  Phase 1: each rank parses its index sets, keeps locally-owned rows in per-IS
  lookup tables (bit table + hash map or dense array, depending on
  PETSC_USE_CTABLE) and sends remotely-owned rows to their owners (tag1).
  Phase 2: owners expand the received rows via MatIncreaseOverlap_MPIAIJ_Receive
  and send the results back (tag2), which are merged into the tables.
  Finally each is[i] is destroyed and recreated from its table.

  Message layout is described in the sample-format comment above this function.
  NOTE(review): the dense (non-CTABLE) branch allocates M*imax integers, which is
  why the _Scalable variant exists.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j]; /* total rows destined for rank j */
        w3[j]++;        /* number of ISs contributing to rank j */
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single tmp buffer into per-destination slices */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          /* keys and values are stored shifted by one so 0 can never collide with the hash map's empty slot */
          PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

    /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      /* process replies in arrival order */
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1;
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscHashIter tpos;

    /* flatten the hash map: value-1 is the position, key-1 is the row (see the +1 shifts above) */
    table_data_i = table_data[i];
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, k);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --k;
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}
791 
792 /*
793    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
794        the work on the local processor.
795 
796      Inputs:
797       C      - MAT_MPIAIJ;
798       imax - total no of index sets processed at a time;
799       table  - an array of char - size = m bits.
800 
801      Output:
802       isz    - array containing the count of the solution elements corresponding
803                to each index set;
804       data or table_data  - pointer to the solutions
805 */
/* Expand each of the imax index sets by one layer of overlap, using only the
   locally stored rows of C (diagonal block A and off-diagonal block B). */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i; /* bit table marking which global indices are already in set i */
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i; /* hash map storing (global index + 1) -> (insertion position + 1) */
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i; /* dense array of the global indices currently in set i */
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps compressed local columns of B to global columns */

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --row; /* keys and values were stored shifted by +1; undo the shift */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i]; /* walk only the entries present before this pass; new ones are appended past max */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      /* NOTE(review): row is assumed to be locally owned here (0 <= row < C->rmap->n);
         presumably guaranteed by the caller having separated off-process rows — verify */
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;         /* local column of A -> global column */
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];          /* compressed column of B -> global column */
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* new size of set i after adding this layer of overlap */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
895 
896 /*
897       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
898          and return the output
899 
900          Input:
901            C    - the matrix
902            nrqr - no of messages being processed.
903            rbuf - an array of pointers to the received requests
904 
905          Output:
906            xdata - array of messages to be sent back
907            isz1  - size of each message
908 
909   For better efficiency perhaps we should malloc separately each xdata[i],
910 then if a remalloc is required we need only copy the data for that one row
911 rather then all previous rows as it is now where a single large chunk of
912 memory is used.
913 
914 */
915 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
916 {
917   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
918   Mat         A = c->A, B = c->B;
919   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
920   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
921   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
922   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
923   PetscInt   *rbuf_i, kmax, rbuf_0;
924   PetscBT     xtable;
925 
926   PetscFunctionBegin;
927   m      = C->rmap->N;
928   rstart = C->rmap->rstart;
929   cstart = C->cmap->rstart;
930   ai     = a->i;
931   aj     = a->j;
932   bi     = b->i;
933   bj     = b->j;
934   garray = c->garray;
935 
936   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
937     rbuf_i = rbuf[i];
938     rbuf_0 = rbuf_i[0];
939     ct += rbuf_0;
940     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
941   }
942 
943   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
944   else max1 = 1;
945   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
946   if (nrqr) {
947     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
948     ++no_malloc;
949   }
950   PetscCall(PetscBTCreate(m, &xtable));
951   PetscCall(PetscArrayzero(isz1, nrqr));
952 
953   ct3 = 0;
954   for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */
955     rbuf_i = rbuf[i];
956     rbuf_0 = rbuf_i[0];
957     ct1    = 2 * rbuf_0 + 1;
958     ct2    = ct1;
959     ct3 += ct1;
960     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
961       PetscCall(PetscBTMemzero(m, xtable));
962       oct2 = ct2;
963       kmax = rbuf_i[2 * j];
964       for (k = 0; k < kmax; k++, ct1++) {
965         row = rbuf_i[ct1];
966         if (!PetscBTLookupSet(xtable, row)) {
967           if (!(ct3 < mem_estimate)) {
968             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
969             PetscCall(PetscMalloc1(new_estimate, &tmp));
970             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
971             PetscCall(PetscFree(xdata[0]));
972             xdata[0]     = tmp;
973             mem_estimate = new_estimate;
974             ++no_malloc;
975             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
976           }
977           xdata[i][ct2++] = row;
978           ct3++;
979         }
980       }
981       for (k = oct2, max2 = ct2; k < max2; k++) {
982         row   = xdata[i][k] - rstart;
983         start = ai[row];
984         end   = ai[row + 1];
985         for (l = start; l < end; l++) {
986           val = aj[l] + cstart;
987           if (!PetscBTLookupSet(xtable, val)) {
988             if (!(ct3 < mem_estimate)) {
989               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
990               PetscCall(PetscMalloc1(new_estimate, &tmp));
991               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
992               PetscCall(PetscFree(xdata[0]));
993               xdata[0]     = tmp;
994               mem_estimate = new_estimate;
995               ++no_malloc;
996               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
997             }
998             xdata[i][ct2++] = val;
999             ct3++;
1000           }
1001         }
1002         start = bi[row];
1003         end   = bi[row + 1];
1004         for (l = start; l < end; l++) {
1005           val = garray[bj[l]];
1006           if (!PetscBTLookupSet(xtable, val)) {
1007             if (!(ct3 < mem_estimate)) {
1008               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1009               PetscCall(PetscMalloc1(new_estimate, &tmp));
1010               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1011               PetscCall(PetscFree(xdata[0]));
1012               xdata[0]     = tmp;
1013               mem_estimate = new_estimate;
1014               ++no_malloc;
1015               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1016             }
1017             xdata[i][ct2++] = val;
1018             ct3++;
1019           }
1020         }
1021       }
1022       /* Update the header*/
1023       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1024       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1025     }
1026     xdata[i][0] = rbuf_0;
1027     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1028     isz1[i] = ct2; /* size of each message */
1029   }
1030   PetscCall(PetscBTDestroy(&xtable));
1031   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1032   PetscFunctionReturn(0);
1033 }
1034 /* -------------------------------------------------------------------------*/
1035 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1036 /*
1037     Every processor gets the entire matrix
1038 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[] is global-length; each rank fills only its own row range, then the
       in-place Allgatherv below completes it on every rank */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    /* Bin is a NULL-terminated array of one matrix, hence the calloc of 2 entries */
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* jsendbuf aliases this rank's slice of B's global j array, so the same
       in-place Allgatherv trick as above can be used for the column indices */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* For each local row, emit columns in globally ascending order:
       B entries below the diagonal block, then A entries, then B entries above */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are rebuilt in units of nonzeros (not rows) for the j gather */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: reuse the previously built sequential matrix */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    /* sendbuf aliases this rank's slice of b->a; values are emitted in the same
       lower-B / A / upper-B order used for the column indices above */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* In the MAT_REUSE_MATRIX path recvcounts was never allocated; do it now */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(0);
}
1184 
1185 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1186 {
1187   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1188   Mat             submat, A = c->A, B = c->B;
1189   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1190   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1191   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1192   const PetscInt *icol, *irow;
1193   PetscInt        nrow, ncol, start;
1194   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1195   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1196   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1197   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1198   PetscInt       *lens, rmax, ncols, *cols, Crow;
1199 #if defined(PETSC_USE_CTABLE)
1200   PetscHMapI cmap, rmap;
1201   PetscInt  *cmap_loc, *rmap_loc;
1202 #else
1203   PetscInt *cmap, *rmap;
1204 #endif
1205   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1206   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1207   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1208   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1209   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1210   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1211   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1212   MPI_Comm      comm;
1213   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1214   PetscMPIInt  *onodes1, *olengths1, idex, end;
1215   Mat_SubSppt  *smatis1;
1216   PetscBool     isrowsorted, iscolsorted;
1217 
1218   PetscFunctionBegin;
1219   PetscValidLogicalCollectiveInt(C, ismax, 2);
1220   PetscValidLogicalCollectiveEnum(C, scall, 5);
1221   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1222   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1223   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1224   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1225   size = c->size;
1226   rank = c->rank;
1227 
1228   PetscCall(ISSorted(iscol[0], &iscolsorted));
1229   PetscCall(ISSorted(isrow[0], &isrowsorted));
1230   PetscCall(ISGetIndices(isrow[0], &irow));
1231   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1232   if (allcolumns) {
1233     icol = NULL;
1234     ncol = C->cmap->N;
1235   } else {
1236     PetscCall(ISGetIndices(iscol[0], &icol));
1237     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1238   }
1239 
1240   if (scall == MAT_INITIAL_MATRIX) {
1241     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1242 
1243     /* Get some new tags to keep the communication clean */
1244     tag1 = ((PetscObject)C)->tag;
1245     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1246     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1247 
1248     /* evaluate communication - mesg to who, length of mesg, and buffer space
1249      required. Based on this, buffers are allocated, and data copied into them */
1250     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1251     PetscCall(PetscMalloc1(nrow, &row2proc));
1252 
1253     /* w1[proc] = num of rows owned by proc -- to be requested */
1254     proc = 0;
1255     nrqs = 0; /* num of outgoing messages */
1256     for (j = 0; j < nrow; j++) {
1257       row = irow[j];
1258       if (!isrowsorted) proc = 0;
1259       while (row >= C->rmap->range[proc + 1]) proc++;
1260       w1[proc]++;
1261       row2proc[j] = proc; /* map row index to proc */
1262 
1263       if (proc != rank && !w2[proc]) {
1264         w2[proc] = 1;
1265         nrqs++;
1266       }
1267     }
1268     w1[rank] = 0; /* rows owned by self will not be requested */
1269 
1270     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1271     for (proc = 0, j = 0; proc < size; proc++) {
1272       if (w1[proc]) pa[j++] = proc;
1273     }
1274 
1275     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1276     msz = 0; /* total mesg length (for all procs) */
1277     for (i = 0; i < nrqs; i++) {
1278       proc = pa[i];
1279       w1[proc] += 3;
1280       msz += w1[proc];
1281     }
1282     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1283 
1284     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1285     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1286     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1287 
1288     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1289        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1290     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1291 
1292     /* Now post the Irecvs corresponding to these messages */
1293     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1294 
1295     PetscCall(PetscFree(onodes1));
1296     PetscCall(PetscFree(olengths1));
1297 
1298     /* Allocate Memory for outgoing messages */
1299     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1300     PetscCall(PetscArrayzero(sbuf1, size));
1301     PetscCall(PetscArrayzero(ptr, size));
1302 
1303     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1304     iptr = tmp;
1305     for (i = 0; i < nrqs; i++) {
1306       proc        = pa[i];
1307       sbuf1[proc] = iptr;
1308       iptr += w1[proc];
1309     }
1310 
1311     /* Form the outgoing messages */
1312     /* Initialize the header space */
1313     for (i = 0; i < nrqs; i++) {
1314       proc = pa[i];
1315       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1316       ptr[proc] = sbuf1[proc] + 3;
1317     }
1318 
1319     /* Parse the isrow and copy data into outbuf */
1320     PetscCall(PetscArrayzero(ctr, size));
1321     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1322       proc = row2proc[j];
1323       if (proc != rank) { /* copy to the outgoing buf*/
1324         *ptr[proc] = irow[j];
1325         ctr[proc]++;
1326         ptr[proc]++;
1327       }
1328     }
1329 
1330     /* Update the headers for the current IS */
1331     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1332       if ((ctr_j = ctr[j])) {
1333         sbuf1_j            = sbuf1[j];
1334         k                  = ++sbuf1_j[0];
1335         sbuf1_j[2 * k]     = ctr_j;
1336         sbuf1_j[2 * k - 1] = 0;
1337       }
1338     }
1339 
1340     /* Now post the sends */
1341     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1342     for (i = 0; i < nrqs; ++i) {
1343       proc = pa[i];
1344       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1345     }
1346 
1347     /* Post Receives to capture the buffer size */
1348     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1349     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1350 
1351     if (nrqs) rbuf2[0] = tmp + msz;
1352     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1353 
1354     for (i = 0; i < nrqs; ++i) {
1355       proc = pa[i];
1356       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1357     }
1358 
1359     PetscCall(PetscFree2(w1, w2));
1360 
1361     /* Send to other procs the buf size they should allocate */
1362     /* Receive messages*/
1363     PetscCall(PetscMalloc1(nrqr, &r_status1));
1364     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1365 
1366     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1367     for (i = 0; i < nrqr; ++i) {
1368       req_size[i] = 0;
1369       rbuf1_i     = rbuf1[i];
1370       start       = 2 * rbuf1_i[0] + 1;
1371       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1372       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1373       sbuf2_i = sbuf2[i];
1374       for (j = start; j < end; j++) {
1375         k          = rbuf1_i[j] - rstart;
1376         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1377         sbuf2_i[j] = ncols;
1378         req_size[i] += ncols;
1379       }
1380       req_source1[i] = r_status1[i].MPI_SOURCE;
1381 
1382       /* form the header */
1383       sbuf2_i[0] = req_size[i];
1384       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1385 
1386       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1387     }
1388 
1389     PetscCall(PetscFree(r_status1));
1390     PetscCall(PetscFree(r_waits1));
1391 
1392     /* rbuf2 is received, Post recv column indices a->j */
1393     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1394 
1395     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1396     for (i = 0; i < nrqs; ++i) {
1397       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1398       req_source2[i] = r_status2[i].MPI_SOURCE;
1399       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1400     }
1401 
1402     /* Wait on sends1 and sends2 */
1403     PetscCall(PetscMalloc1(nrqs, &s_status1));
1404     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1405     PetscCall(PetscFree(s_waits1));
1406     PetscCall(PetscFree(s_status1));
1407 
1408     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1409     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1410 
1411     /* Now allocate sending buffers for a->j, and send them off */
1412     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1413     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1414     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1415     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1416 
1417     for (i = 0; i < nrqr; i++) { /* for each requested message */
1418       rbuf1_i   = rbuf1[i];
1419       sbuf_aj_i = sbuf_aj[i];
1420       ct1       = 2 * rbuf1_i[0] + 1;
1421       ct2       = 0;
1422       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1423 
1424       kmax = rbuf1[i][2];
1425       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1426         row    = rbuf1_i[ct1] - rstart;
1427         nzA    = ai[row + 1] - ai[row];
1428         nzB    = bi[row + 1] - bi[row];
1429         ncols  = nzA + nzB;
1430         cworkA = aj + ai[row];
1431         cworkB = bj + bi[row];
1432 
1433         /* load the column indices for this row into cols*/
1434         cols = sbuf_aj_i + ct2;
1435 
1436         lwrite = 0;
1437         for (l = 0; l < nzB; l++) {
1438           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1439         }
1440         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1441         for (l = 0; l < nzB; l++) {
1442           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1443         }
1444 
1445         ct2 += ncols;
1446       }
1447       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1448     }
1449 
1450     /* create column map (cmap): global col of C -> local col of submat */
1451 #if defined(PETSC_USE_CTABLE)
1452     if (!allcolumns) {
1453       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1454       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1455       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1456         if (icol[j] >= cstart && icol[j] < cend) {
1457           cmap_loc[icol[j] - cstart] = j + 1;
1458         } else { /* use PetscHMapI for non-local col indices */
1459           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1460         }
1461       }
1462     } else {
1463       cmap     = NULL;
1464       cmap_loc = NULL;
1465     }
1466     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1467 #else
1468     if (!allcolumns) {
1469       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1470       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1471     } else {
1472       cmap = NULL;
1473     }
1474 #endif
1475 
1476     /* Create lens for MatSeqAIJSetPreallocation() */
1477     PetscCall(PetscCalloc1(nrow, &lens));
1478 
1479     /* Compute lens from local part of C */
1480     for (j = 0; j < nrow; j++) {
1481       row  = irow[j];
1482       proc = row2proc[j];
1483       if (proc == rank) {
1484         /* diagonal part A = c->A */
1485         ncols = ai[row - rstart + 1] - ai[row - rstart];
1486         cols  = aj + ai[row - rstart];
1487         if (!allcolumns) {
1488           for (k = 0; k < ncols; k++) {
1489 #if defined(PETSC_USE_CTABLE)
1490             tcol = cmap_loc[cols[k]];
1491 #else
1492             tcol = cmap[cols[k] + cstart];
1493 #endif
1494             if (tcol) lens[j]++;
1495           }
1496         } else { /* allcolumns */
1497           lens[j] = ncols;
1498         }
1499 
1500         /* off-diagonal part B = c->B */
1501         ncols = bi[row - rstart + 1] - bi[row - rstart];
1502         cols  = bj + bi[row - rstart];
1503         if (!allcolumns) {
1504           for (k = 0; k < ncols; k++) {
1505 #if defined(PETSC_USE_CTABLE)
1506             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1507 #else
1508             tcol = cmap[bmap[cols[k]]];
1509 #endif
1510             if (tcol) lens[j]++;
1511           }
1512         } else { /* allcolumns */
1513           lens[j] += ncols;
1514         }
1515       }
1516     }
1517 
1518     /* Create row map (rmap): global row of C -> local row of submat */
1519 #if defined(PETSC_USE_CTABLE)
1520     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1521     for (j = 0; j < nrow; j++) {
1522       row  = irow[j];
1523       proc = row2proc[j];
1524       if (proc == rank) { /* a local row */
1525         rmap_loc[row - rstart] = j;
1526       } else {
1527         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1528       }
1529     }
1530 #else
1531     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1532     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1533 #endif
1534 
1535     /* Update lens from offproc data */
1536     /* recv a->j is done */
1537     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1538     for (i = 0; i < nrqs; i++) {
1539       proc    = pa[i];
1540       sbuf1_i = sbuf1[proc];
1541       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1542       ct1     = 2 + 1;
1543       ct2     = 0;
1544       rbuf2_i = rbuf2[i]; /* received length of C->j */
1545       rbuf3_i = rbuf3[i]; /* received C->j */
1546 
1547       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1548       max1 = sbuf1_i[2];
1549       for (k = 0; k < max1; k++, ct1++) {
1550 #if defined(PETSC_USE_CTABLE)
1551         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1552         row--;
1553         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1554 #else
1555         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1556 #endif
1557         /* Now, store row index of submat in sbuf1_i[ct1] */
1558         sbuf1_i[ct1] = row;
1559 
1560         nnz = rbuf2_i[ct1];
1561         if (!allcolumns) {
1562           for (l = 0; l < nnz; l++, ct2++) {
1563 #if defined(PETSC_USE_CTABLE)
1564             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1565               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1566             } else {
1567               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1568             }
1569 #else
1570             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1571 #endif
1572             if (tcol) lens[row]++;
1573           }
1574         } else { /* allcolumns */
1575           lens[row] += nnz;
1576         }
1577       }
1578     }
1579     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1580     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1581 
1582     /* Create the submatrices */
1583     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1584     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1585 
1586     PetscCall(ISGetBlockSize(isrow[0], &i));
1587     PetscCall(ISGetBlockSize(iscol[0], &j));
1588     PetscCall(MatSetBlockSizes(submat, i, j));
1589     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1590     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1591 
1592     /* create struct Mat_SubSppt and attached it to submat */
1593     PetscCall(PetscNew(&smatis1));
1594     subc            = (Mat_SeqAIJ *)submat->data;
1595     subc->submatis1 = smatis1;
1596 
1597     smatis1->id          = 0;
1598     smatis1->nrqs        = nrqs;
1599     smatis1->nrqr        = nrqr;
1600     smatis1->rbuf1       = rbuf1;
1601     smatis1->rbuf2       = rbuf2;
1602     smatis1->rbuf3       = rbuf3;
1603     smatis1->sbuf2       = sbuf2;
1604     smatis1->req_source2 = req_source2;
1605 
1606     smatis1->sbuf1 = sbuf1;
1607     smatis1->ptr   = ptr;
1608     smatis1->tmp   = tmp;
1609     smatis1->ctr   = ctr;
1610 
1611     smatis1->pa          = pa;
1612     smatis1->req_size    = req_size;
1613     smatis1->req_source1 = req_source1;
1614 
1615     smatis1->allcolumns = allcolumns;
1616     smatis1->singleis   = PETSC_TRUE;
1617     smatis1->row2proc   = row2proc;
1618     smatis1->rmap       = rmap;
1619     smatis1->cmap       = cmap;
1620 #if defined(PETSC_USE_CTABLE)
1621     smatis1->rmap_loc = rmap_loc;
1622     smatis1->cmap_loc = cmap_loc;
1623 #endif
1624 
1625     smatis1->destroy     = submat->ops->destroy;
1626     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1627     submat->factortype   = C->factortype;
1628 
1629     /* compute rmax */
1630     rmax = 0;
1631     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1632 
1633   } else { /* scall == MAT_REUSE_MATRIX */
1634     submat = submats[0];
1635     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1636 
1637     subc        = (Mat_SeqAIJ *)submat->data;
1638     rmax        = subc->rmax;
1639     smatis1     = subc->submatis1;
1640     nrqs        = smatis1->nrqs;
1641     nrqr        = smatis1->nrqr;
1642     rbuf1       = smatis1->rbuf1;
1643     rbuf2       = smatis1->rbuf2;
1644     rbuf3       = smatis1->rbuf3;
1645     req_source2 = smatis1->req_source2;
1646 
1647     sbuf1 = smatis1->sbuf1;
1648     sbuf2 = smatis1->sbuf2;
1649     ptr   = smatis1->ptr;
1650     tmp   = smatis1->tmp;
1651     ctr   = smatis1->ctr;
1652 
1653     pa          = smatis1->pa;
1654     req_size    = smatis1->req_size;
1655     req_source1 = smatis1->req_source1;
1656 
1657     allcolumns = smatis1->allcolumns;
1658     row2proc   = smatis1->row2proc;
1659     rmap       = smatis1->rmap;
1660     cmap       = smatis1->cmap;
1661 #if defined(PETSC_USE_CTABLE)
1662     rmap_loc = smatis1->rmap_loc;
1663     cmap_loc = smatis1->cmap_loc;
1664 #endif
1665   }
1666 
1667   /* Post recv matrix values */
1668   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1669   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1670   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1671   for (i = 0; i < nrqs; ++i) {
1672     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1673     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1674   }
1675 
1676   /* Allocate sending buffers for a->a, and send them off */
1677   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1678   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1679   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1680   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1681 
1682   for (i = 0; i < nrqr; i++) {
1683     rbuf1_i   = rbuf1[i];
1684     sbuf_aa_i = sbuf_aa[i];
1685     ct1       = 2 * rbuf1_i[0] + 1;
1686     ct2       = 0;
1687     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1688 
1689     kmax = rbuf1_i[2];
1690     for (k = 0; k < kmax; k++, ct1++) {
1691       row    = rbuf1_i[ct1] - rstart;
1692       nzA    = ai[row + 1] - ai[row];
1693       nzB    = bi[row + 1] - bi[row];
1694       ncols  = nzA + nzB;
1695       cworkB = bj + bi[row];
1696       vworkA = a_a + ai[row];
1697       vworkB = b_a + bi[row];
1698 
1699       /* load the column values for this row into vals*/
1700       vals = sbuf_aa_i + ct2;
1701 
1702       lwrite = 0;
1703       for (l = 0; l < nzB; l++) {
1704         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1705       }
1706       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1707       for (l = 0; l < nzB; l++) {
1708         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1709       }
1710 
1711       ct2 += ncols;
1712     }
1713     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1714   }
1715 
1716   /* Assemble submat */
1717   /* First assemble the local rows */
1718   for (j = 0; j < nrow; j++) {
1719     row  = irow[j];
1720     proc = row2proc[j];
1721     if (proc == rank) {
1722       Crow = row - rstart; /* local row index of C */
1723 #if defined(PETSC_USE_CTABLE)
1724       row = rmap_loc[Crow]; /* row index of submat */
1725 #else
1726       row = rmap[row];
1727 #endif
1728 
1729       if (allcolumns) {
1730         /* diagonal part A = c->A */
1731         ncols = ai[Crow + 1] - ai[Crow];
1732         cols  = aj + ai[Crow];
1733         vals  = a_a + ai[Crow];
1734         i     = 0;
1735         for (k = 0; k < ncols; k++) {
1736           subcols[i]   = cols[k] + cstart;
1737           subvals[i++] = vals[k];
1738         }
1739 
1740         /* off-diagonal part B = c->B */
1741         ncols = bi[Crow + 1] - bi[Crow];
1742         cols  = bj + bi[Crow];
1743         vals  = b_a + bi[Crow];
1744         for (k = 0; k < ncols; k++) {
1745           subcols[i]   = bmap[cols[k]];
1746           subvals[i++] = vals[k];
1747         }
1748 
1749         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1750 
1751       } else { /* !allcolumns */
1752 #if defined(PETSC_USE_CTABLE)
1753         /* diagonal part A = c->A */
1754         ncols = ai[Crow + 1] - ai[Crow];
1755         cols  = aj + ai[Crow];
1756         vals  = a_a + ai[Crow];
1757         i     = 0;
1758         for (k = 0; k < ncols; k++) {
1759           tcol = cmap_loc[cols[k]];
1760           if (tcol) {
1761             subcols[i]   = --tcol;
1762             subvals[i++] = vals[k];
1763           }
1764         }
1765 
1766         /* off-diagonal part B = c->B */
1767         ncols = bi[Crow + 1] - bi[Crow];
1768         cols  = bj + bi[Crow];
1769         vals  = b_a + bi[Crow];
1770         for (k = 0; k < ncols; k++) {
1771           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1772           if (tcol) {
1773             subcols[i]   = --tcol;
1774             subvals[i++] = vals[k];
1775           }
1776         }
1777 #else
1778         /* diagonal part A = c->A */
1779         ncols = ai[Crow + 1] - ai[Crow];
1780         cols  = aj + ai[Crow];
1781         vals  = a_a + ai[Crow];
1782         i     = 0;
1783         for (k = 0; k < ncols; k++) {
1784           tcol = cmap[cols[k] + cstart];
1785           if (tcol) {
1786             subcols[i]   = --tcol;
1787             subvals[i++] = vals[k];
1788           }
1789         }
1790 
1791         /* off-diagonal part B = c->B */
1792         ncols = bi[Crow + 1] - bi[Crow];
1793         cols  = bj + bi[Crow];
1794         vals  = b_a + bi[Crow];
1795         for (k = 0; k < ncols; k++) {
1796           tcol = cmap[bmap[cols[k]]];
1797           if (tcol) {
1798             subcols[i]   = --tcol;
1799             subvals[i++] = vals[k];
1800           }
1801         }
1802 #endif
1803         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1804       }
1805     }
1806   }
1807 
1808   /* Now assemble the off-proc rows */
1809   for (i = 0; i < nrqs; i++) { /* for each requested message */
1810     /* recv values from other processes */
1811     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1812     proc    = pa[idex];
1813     sbuf1_i = sbuf1[proc];
1814     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1815     ct1     = 2 + 1;
1816     ct2     = 0;           /* count of received C->j */
1817     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1818     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1819     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1820     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1821 
1822     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1823     max1 = sbuf1_i[2];                  /* num of rows */
1824     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1825       row = sbuf1_i[ct1];               /* row index of submat */
1826       if (!allcolumns) {
1827         idex = 0;
1828         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1829           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1830           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1831 #if defined(PETSC_USE_CTABLE)
1832             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1833               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1834             } else {
1835               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1836             }
1837 #else
1838             tcol = cmap[rbuf3_i[ct2]];
1839 #endif
1840             if (tcol) {
1841               subcols[idex]   = --tcol; /* may not be sorted */
1842               subvals[idex++] = rbuf4_i[ct2];
1843 
1844               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1845                For reuse, we replace received C->j with index that should be inserted to submat */
1846               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1847             }
1848           }
1849           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1850         } else { /* scall == MAT_REUSE_MATRIX */
1851           submat = submats[0];
1852           subc   = (Mat_SeqAIJ *)submat->data;
1853 
1854           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1855           for (l = 0; l < nnz; l++) {
1856             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1857             subvals[idex++] = rbuf4_i[ct2];
1858           }
1859 
1860           bj = subc->j + subc->i[row]; /* sorted column indices */
1861           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1862         }
1863       } else {              /* allcolumns */
1864         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1865         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1866         ct2 += nnz;
1867       }
1868     }
1869   }
1870 
1871   /* sending a->a are done */
1872   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1873   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1874 
1875   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1876   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1877   submats[0] = submat;
1878 
1879   /* Restore the indices */
1880   PetscCall(ISRestoreIndices(isrow[0], &irow));
1881   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1882 
1883   /* Destroy allocated memory */
1884   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1885   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1886   if (sbuf_aa) {
1887     PetscCall(PetscFree(sbuf_aa[0]));
1888     PetscCall(PetscFree(sbuf_aa));
1889   }
1890 
1891   if (scall == MAT_INITIAL_MATRIX) {
1892     PetscCall(PetscFree(lens));
1893     if (sbuf_aj) {
1894       PetscCall(PetscFree(sbuf_aj[0]));
1895       PetscCall(PetscFree(sbuf_aj));
1896     }
1897   }
1898   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1899   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1900   PetscFunctionReturn(0);
1901 }
1902 
1903 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1904 {
1905   PetscInt  ncol;
1906   PetscBool colflag, allcolumns = PETSC_FALSE;
1907 
1908   PetscFunctionBegin;
1909   /* Allocate memory to hold all the submatrices */
1910   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1911 
1912   /* Check for special case: each processor gets entire matrix columns */
1913   PetscCall(ISIdentity(iscol[0], &colflag));
1914   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1915   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1916 
1917   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1918   PetscFunctionReturn(0);
1919 }
1920 
/*
  MatCreateSubMatrices_MPIAIJ - MPIAIJ implementation of MatCreateSubMatrices(): extracts
  ismax sequential submatrices of C, one per (isrow[i], iscol[i]) pair.

  To bound scratch memory, the index sets are processed in stages of at most nmax sets
  each. Each stage is collective, so all ranks must agree on the (global) number of
  stages; ranks with fewer local stages participate with max_no = 0 and receive a
  "dummy" matrix whose data field carries the Mat_SubSppt bookkeeping for reuse.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of index sets per stage, sized so a stage uses roughly 20 MB of
     PetscInt scratch (nmax * C->cmap->N * sizeof(PetscInt) ~ 20e6 bytes); at least 1 */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      /* Single IS selecting the entire (square) matrix: eligible for the fast
         all-matrix path; can be disabled via -use_fast_submatrix */
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single MPI_MAX reduction combines two values:
       - in[0] = -wantallmatrix: max over {-1, 0} is 0 if ANY rank did not set the flag,
         so negating the result yields a logical AND across ranks;
       - in[1] = local nstages: max gives the global stage count all ranks must loop over */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the Mat_SubSppt bookkeeping saved by the initial call */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  /* Stage loop: every rank executes all nstages iterations (each _Local call is
     collective), even after its own index sets are exhausted (max_no == 0) */
  for (i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2017 
2018 /* -------------------------------------------------------------------------*/
2019 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2020 {
2021   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2022   Mat              A = c->A;
2023   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2024   const PetscInt **icol, **irow;
2025   PetscInt        *nrow, *ncol, start;
2026   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2027   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2028   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2029   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2030   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2031 #if defined(PETSC_USE_CTABLE)
2032   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2033 #else
2034   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2035 #endif
2036   const PetscInt *irow_i;
2037   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2038   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2039   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2040   MPI_Comm        comm;
2041   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2042   PetscMPIInt    *onodes1, *olengths1, end;
2043   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2044   Mat_SubSppt    *smat_i;
2045   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2046   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2047 
2048   PetscFunctionBegin;
2049   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2050   size = c->size;
2051   rank = c->rank;
2052 
2053   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2054   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2055 
2056   for (i = 0; i < ismax; i++) {
2057     PetscCall(ISSorted(iscol[i], &issorted[i]));
2058     if (!issorted[i]) iscsorted = issorted[i];
2059 
2060     PetscCall(ISSorted(isrow[i], &issorted[i]));
2061 
2062     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2063     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2064 
2065     /* Check for special case: allcolumn */
2066     PetscCall(ISIdentity(iscol[i], &colflag));
2067     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2068     if (colflag && ncol[i] == C->cmap->N) {
2069       allcolumns[i] = PETSC_TRUE;
2070       icol[i]       = NULL;
2071     } else {
2072       allcolumns[i] = PETSC_FALSE;
2073       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2074     }
2075   }
2076 
2077   if (scall == MAT_REUSE_MATRIX) {
2078     /* Assumes new rows are same length as the old rows */
2079     for (i = 0; i < ismax; i++) {
2080       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2081       subc = (Mat_SeqAIJ *)submats[i]->data;
2082       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2083 
2084       /* Initial matrix as if empty */
2085       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2086 
2087       smat_i = subc->submatis1;
2088 
2089       nrqs        = smat_i->nrqs;
2090       nrqr        = smat_i->nrqr;
2091       rbuf1       = smat_i->rbuf1;
2092       rbuf2       = smat_i->rbuf2;
2093       rbuf3       = smat_i->rbuf3;
2094       req_source2 = smat_i->req_source2;
2095 
2096       sbuf1 = smat_i->sbuf1;
2097       sbuf2 = smat_i->sbuf2;
2098       ptr   = smat_i->ptr;
2099       tmp   = smat_i->tmp;
2100       ctr   = smat_i->ctr;
2101 
2102       pa          = smat_i->pa;
2103       req_size    = smat_i->req_size;
2104       req_source1 = smat_i->req_source1;
2105 
2106       allcolumns[i] = smat_i->allcolumns;
2107       row2proc[i]   = smat_i->row2proc;
2108       rmap[i]       = smat_i->rmap;
2109       cmap[i]       = smat_i->cmap;
2110     }
2111 
2112     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2113       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2114       smat_i = (Mat_SubSppt *)submats[0]->data;
2115 
2116       nrqs        = smat_i->nrqs;
2117       nrqr        = smat_i->nrqr;
2118       rbuf1       = smat_i->rbuf1;
2119       rbuf2       = smat_i->rbuf2;
2120       rbuf3       = smat_i->rbuf3;
2121       req_source2 = smat_i->req_source2;
2122 
2123       sbuf1 = smat_i->sbuf1;
2124       sbuf2 = smat_i->sbuf2;
2125       ptr   = smat_i->ptr;
2126       tmp   = smat_i->tmp;
2127       ctr   = smat_i->ctr;
2128 
2129       pa          = smat_i->pa;
2130       req_size    = smat_i->req_size;
2131       req_source1 = smat_i->req_source1;
2132 
2133       allcolumns[0] = PETSC_FALSE;
2134     }
2135   } else { /* scall == MAT_INITIAL_MATRIX */
2136     /* Get some new tags to keep the communication clean */
2137     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2138     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2139 
2140     /* evaluate communication - mesg to who, length of mesg, and buffer space
2141      required. Based on this, buffers are allocated, and data copied into them*/
2142     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2143 
2144     for (i = 0; i < ismax; i++) {
2145       jmax   = nrow[i];
2146       irow_i = irow[i];
2147 
2148       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2149       row2proc[i] = row2proc_i;
2150 
2151       if (issorted[i]) proc = 0;
2152       for (j = 0; j < jmax; j++) {
2153         if (!issorted[i]) proc = 0;
2154         row = irow_i[j];
2155         while (row >= C->rmap->range[proc + 1]) proc++;
2156         w4[proc]++;
2157         row2proc_i[j] = proc; /* map row index to proc */
2158       }
2159       for (j = 0; j < size; j++) {
2160         if (w4[j]) {
2161           w1[j] += w4[j];
2162           w3[j]++;
2163           w4[j] = 0;
2164         }
2165       }
2166     }
2167 
2168     nrqs     = 0; /* no of outgoing messages */
2169     msz      = 0; /* total mesg length (for all procs) */
2170     w1[rank] = 0; /* no mesg sent to self */
2171     w3[rank] = 0;
2172     for (i = 0; i < size; i++) {
2173       if (w1[i]) {
2174         w2[i] = 1;
2175         nrqs++;
2176       } /* there exists a message to proc i */
2177     }
2178     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2179     for (i = 0, j = 0; i < size; i++) {
2180       if (w1[i]) {
2181         pa[j] = i;
2182         j++;
2183       }
2184     }
2185 
2186     /* Each message would have a header = 1 + 2*(no of IS) + data */
2187     for (i = 0; i < nrqs; i++) {
2188       j = pa[i];
2189       w1[j] += w2[j] + 2 * w3[j];
2190       msz += w1[j];
2191     }
2192     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2193 
2194     /* Determine the number of messages to expect, their lengths, from from-ids */
2195     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2196     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2197 
2198     /* Now post the Irecvs corresponding to these messages */
2199     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2200     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2201 
2202     /* Allocate Memory for outgoing messages */
2203     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2204     PetscCall(PetscArrayzero(sbuf1, size));
2205     PetscCall(PetscArrayzero(ptr, size));
2206 
2207     {
2208       PetscInt *iptr = tmp;
2209       k              = 0;
2210       for (i = 0; i < nrqs; i++) {
2211         j = pa[i];
2212         iptr += k;
2213         sbuf1[j] = iptr;
2214         k        = w1[j];
2215       }
2216     }
2217 
2218     /* Form the outgoing messages. Initialize the header space */
2219     for (i = 0; i < nrqs; i++) {
2220       j           = pa[i];
2221       sbuf1[j][0] = 0;
2222       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2223       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2224     }
2225 
2226     /* Parse the isrow and copy data into outbuf */
2227     for (i = 0; i < ismax; i++) {
2228       row2proc_i = row2proc[i];
2229       PetscCall(PetscArrayzero(ctr, size));
2230       irow_i = irow[i];
2231       jmax   = nrow[i];
2232       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2233         proc = row2proc_i[j];
2234         if (proc != rank) { /* copy to the outgoing buf*/
2235           ctr[proc]++;
2236           *ptr[proc] = irow_i[j];
2237           ptr[proc]++;
2238         }
2239       }
2240       /* Update the headers for the current IS */
2241       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2242         if ((ctr_j = ctr[j])) {
2243           sbuf1_j            = sbuf1[j];
2244           k                  = ++sbuf1_j[0];
2245           sbuf1_j[2 * k]     = ctr_j;
2246           sbuf1_j[2 * k - 1] = i;
2247         }
2248       }
2249     }
2250 
2251     /*  Now  post the sends */
2252     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2253     for (i = 0; i < nrqs; ++i) {
2254       j = pa[i];
2255       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2256     }
2257 
2258     /* Post Receives to capture the buffer size */
2259     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2260     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2261     if (nrqs) rbuf2[0] = tmp + msz;
2262     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2263     for (i = 0; i < nrqs; ++i) {
2264       j = pa[i];
2265       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2266     }
2267 
2268     /* Send to other procs the buf size they should allocate */
2269     /* Receive messages*/
2270     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2271     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2272     {
2273       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2274       PetscInt *sbuf2_i;
2275 
2276       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2277       for (i = 0; i < nrqr; ++i) {
2278         req_size[i] = 0;
2279         rbuf1_i     = rbuf1[i];
2280         start       = 2 * rbuf1_i[0] + 1;
2281         end         = olengths1[i];
2282         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2283         sbuf2_i = sbuf2[i];
2284         for (j = start; j < end; j++) {
2285           id         = rbuf1_i[j] - rstart;
2286           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2287           sbuf2_i[j] = ncols;
2288           req_size[i] += ncols;
2289         }
2290         req_source1[i] = onodes1[i];
2291         /* form the header */
2292         sbuf2_i[0] = req_size[i];
2293         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2294 
2295         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2296       }
2297     }
2298 
2299     PetscCall(PetscFree(onodes1));
2300     PetscCall(PetscFree(olengths1));
2301     PetscCall(PetscFree(r_waits1));
2302     PetscCall(PetscFree4(w1, w2, w3, w4));
2303 
2304     /* Receive messages*/
2305     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2306     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2307     for (i = 0; i < nrqs; ++i) {
2308       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2309       req_source2[i] = pa[i];
2310       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2311     }
2312     PetscCall(PetscFree(r_waits2));
2313 
2314     /* Wait on sends1 and sends2 */
2315     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2316     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2317     PetscCall(PetscFree(s_waits1));
2318     PetscCall(PetscFree(s_waits2));
2319 
2320     /* Now allocate sending buffers for a->j, and send them off */
2321     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2322     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2323     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2324     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2325 
2326     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2327     {
2328       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2329       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2330       PetscInt  cend = C->cmap->rend;
2331       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2332 
2333       for (i = 0; i < nrqr; i++) {
2334         rbuf1_i   = rbuf1[i];
2335         sbuf_aj_i = sbuf_aj[i];
2336         ct1       = 2 * rbuf1_i[0] + 1;
2337         ct2       = 0;
2338         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2339           kmax = rbuf1[i][2 * j];
2340           for (k = 0; k < kmax; k++, ct1++) {
2341             row    = rbuf1_i[ct1] - rstart;
2342             nzA    = a_i[row + 1] - a_i[row];
2343             nzB    = b_i[row + 1] - b_i[row];
2344             ncols  = nzA + nzB;
2345             cworkA = a_j + a_i[row];
2346             cworkB = b_j + b_i[row];
2347 
2348             /* load the column indices for this row into cols */
2349             cols = sbuf_aj_i + ct2;
2350 
2351             lwrite = 0;
2352             for (l = 0; l < nzB; l++) {
2353               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2354             }
2355             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2356             for (l = 0; l < nzB; l++) {
2357               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2358             }
2359 
2360             ct2 += ncols;
2361           }
2362         }
2363         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2364       }
2365     }
2366 
2367     /* create col map: global col of C -> local col of submatrices */
2368     {
2369       const PetscInt *icol_i;
2370 #if defined(PETSC_USE_CTABLE)
2371       for (i = 0; i < ismax; i++) {
2372         if (!allcolumns[i]) {
2373           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2374 
2375           jmax   = ncol[i];
2376           icol_i = icol[i];
2377           cmap_i = cmap[i];
2378           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2379         } else cmap[i] = NULL;
2380       }
2381 #else
2382       for (i = 0; i < ismax; i++) {
2383         if (!allcolumns[i]) {
2384           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2385           jmax   = ncol[i];
2386           icol_i = icol[i];
2387           cmap_i = cmap[i];
2388           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2389         } else cmap[i] = NULL;
2390       }
2391 #endif
2392     }
2393 
2394     /* Create lens which is required for MatCreate... */
2395     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2396     PetscCall(PetscMalloc1(ismax, &lens));
2397 
2398     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2399     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2400 
2401     /* Update lens from local data */
2402     for (i = 0; i < ismax; i++) {
2403       row2proc_i = row2proc[i];
2404       jmax       = nrow[i];
2405       if (!allcolumns[i]) cmap_i = cmap[i];
2406       irow_i = irow[i];
2407       lens_i = lens[i];
2408       for (j = 0; j < jmax; j++) {
2409         row  = irow_i[j];
2410         proc = row2proc_i[j];
2411         if (proc == rank) {
2412           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2413           if (!allcolumns[i]) {
2414             for (k = 0; k < ncols; k++) {
2415 #if defined(PETSC_USE_CTABLE)
2416               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2417 #else
2418               tcol = cmap_i[cols[k]];
2419 #endif
2420               if (tcol) lens_i[j]++;
2421             }
2422           } else { /* allcolumns */
2423             lens_i[j] = ncols;
2424           }
2425           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2426         }
2427       }
2428     }
2429 
2430     /* Create row map: global row of C -> local row of submatrices */
2431 #if defined(PETSC_USE_CTABLE)
2432     for (i = 0; i < ismax; i++) {
2433       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2434       irow_i = irow[i];
2435       jmax   = nrow[i];
2436       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2437     }
2438 #else
2439     for (i = 0; i < ismax; i++) {
2440       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2441       rmap_i = rmap[i];
2442       irow_i = irow[i];
2443       jmax   = nrow[i];
2444       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2445     }
2446 #endif
2447 
2448     /* Update lens from offproc data */
2449     {
2450       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2451 
2452       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2453       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2454         sbuf1_i = sbuf1[pa[tmp2]];
2455         jmax    = sbuf1_i[0];
2456         ct1     = 2 * jmax + 1;
2457         ct2     = 0;
2458         rbuf2_i = rbuf2[tmp2];
2459         rbuf3_i = rbuf3[tmp2];
2460         for (j = 1; j <= jmax; j++) {
2461           is_no  = sbuf1_i[2 * j - 1];
2462           max1   = sbuf1_i[2 * j];
2463           lens_i = lens[is_no];
2464           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2465           rmap_i = rmap[is_no];
2466           for (k = 0; k < max1; k++, ct1++) {
2467 #if defined(PETSC_USE_CTABLE)
2468             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2469             row--;
2470             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2471 #else
2472             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2473 #endif
2474             max2 = rbuf2_i[ct1];
2475             for (l = 0; l < max2; l++, ct2++) {
2476               if (!allcolumns[is_no]) {
2477 #if defined(PETSC_USE_CTABLE)
2478                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2479 #else
2480                 tcol = cmap_i[rbuf3_i[ct2]];
2481 #endif
2482                 if (tcol) lens_i[row]++;
2483               } else {         /* allcolumns */
2484                 lens_i[row]++; /* lens_i[row] += max2 ? */
2485               }
2486             }
2487           }
2488         }
2489       }
2490     }
2491     PetscCall(PetscFree(r_waits3));
2492     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2493     PetscCall(PetscFree(s_waits3));
2494 
2495     /* Create the submatrices */
2496     for (i = 0; i < ismax; i++) {
2497       PetscInt rbs, cbs;
2498 
2499       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2500       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2501 
2502       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2503       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2504 
2505       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2506       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2507       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2508 
2509       /* create struct Mat_SubSppt and attached it to submat */
2510       PetscCall(PetscNew(&smat_i));
2511       subc            = (Mat_SeqAIJ *)submats[i]->data;
2512       subc->submatis1 = smat_i;
2513 
2514       smat_i->destroy          = submats[i]->ops->destroy;
2515       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2516       submats[i]->factortype   = C->factortype;
2517 
2518       smat_i->id          = i;
2519       smat_i->nrqs        = nrqs;
2520       smat_i->nrqr        = nrqr;
2521       smat_i->rbuf1       = rbuf1;
2522       smat_i->rbuf2       = rbuf2;
2523       smat_i->rbuf3       = rbuf3;
2524       smat_i->sbuf2       = sbuf2;
2525       smat_i->req_source2 = req_source2;
2526 
2527       smat_i->sbuf1 = sbuf1;
2528       smat_i->ptr   = ptr;
2529       smat_i->tmp   = tmp;
2530       smat_i->ctr   = ctr;
2531 
2532       smat_i->pa          = pa;
2533       smat_i->req_size    = req_size;
2534       smat_i->req_source1 = req_source1;
2535 
2536       smat_i->allcolumns = allcolumns[i];
2537       smat_i->singleis   = PETSC_FALSE;
2538       smat_i->row2proc   = row2proc[i];
2539       smat_i->rmap       = rmap[i];
2540       smat_i->cmap       = cmap[i];
2541     }
2542 
2543     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2544       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2545       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2546       PetscCall(MatSetType(submats[0], MATDUMMY));
2547 
2548       /* create struct Mat_SubSppt and attached it to submat */
2549       PetscCall(PetscNew(&smat_i));
2550       submats[0]->data = (void *)smat_i;
2551 
2552       smat_i->destroy          = submats[0]->ops->destroy;
2553       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2554       submats[0]->factortype   = C->factortype;
2555 
2556       smat_i->id          = 0;
2557       smat_i->nrqs        = nrqs;
2558       smat_i->nrqr        = nrqr;
2559       smat_i->rbuf1       = rbuf1;
2560       smat_i->rbuf2       = rbuf2;
2561       smat_i->rbuf3       = rbuf3;
2562       smat_i->sbuf2       = sbuf2;
2563       smat_i->req_source2 = req_source2;
2564 
2565       smat_i->sbuf1 = sbuf1;
2566       smat_i->ptr   = ptr;
2567       smat_i->tmp   = tmp;
2568       smat_i->ctr   = ctr;
2569 
2570       smat_i->pa          = pa;
2571       smat_i->req_size    = req_size;
2572       smat_i->req_source1 = req_source1;
2573 
2574       smat_i->allcolumns = PETSC_FALSE;
2575       smat_i->singleis   = PETSC_FALSE;
2576       smat_i->row2proc   = NULL;
2577       smat_i->rmap       = NULL;
2578       smat_i->cmap       = NULL;
2579     }
2580 
2581     if (ismax) PetscCall(PetscFree(lens[0]));
2582     PetscCall(PetscFree(lens));
2583     if (sbuf_aj) {
2584       PetscCall(PetscFree(sbuf_aj[0]));
2585       PetscCall(PetscFree(sbuf_aj));
2586     }
2587 
2588   } /* endof scall == MAT_INITIAL_MATRIX */
2589 
2590   /* Post recv matrix values */
2591   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2592   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2593   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2594   for (i = 0; i < nrqs; ++i) {
2595     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2596     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2597   }
2598 
2599   /* Allocate sending buffers for a->a, and send them off */
2600   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2601   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2602   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2603   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2604 
2605   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2606   {
2607     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2608     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2609     PetscInt     cend = C->cmap->rend;
2610     PetscInt    *b_j  = b->j;
2611     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2612 
2613     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2614     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2615     for (i = 0; i < nrqr; i++) {
2616       rbuf1_i   = rbuf1[i];
2617       sbuf_aa_i = sbuf_aa[i];
2618       ct1       = 2 * rbuf1_i[0] + 1;
2619       ct2       = 0;
2620       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2621         kmax = rbuf1_i[2 * j];
2622         for (k = 0; k < kmax; k++, ct1++) {
2623           row    = rbuf1_i[ct1] - rstart;
2624           nzA    = a_i[row + 1] - a_i[row];
2625           nzB    = b_i[row + 1] - b_i[row];
2626           ncols  = nzA + nzB;
2627           cworkB = b_j + b_i[row];
2628           vworkA = a_a + a_i[row];
2629           vworkB = b_a + b_i[row];
2630 
2631           /* load the column values for this row into vals*/
2632           vals = sbuf_aa_i + ct2;
2633 
2634           lwrite = 0;
2635           for (l = 0; l < nzB; l++) {
2636             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2637           }
2638           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2639           for (l = 0; l < nzB; l++) {
2640             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2641           }
2642 
2643           ct2 += ncols;
2644         }
2645       }
2646       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2647     }
2648     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2649     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2650   }
2651 
2652   /* Assemble the matrices */
2653   /* First assemble the local rows */
2654   for (i = 0; i < ismax; i++) {
2655     row2proc_i = row2proc[i];
2656     subc       = (Mat_SeqAIJ *)submats[i]->data;
2657     imat_ilen  = subc->ilen;
2658     imat_j     = subc->j;
2659     imat_i     = subc->i;
2660     imat_a     = subc->a;
2661 
2662     if (!allcolumns[i]) cmap_i = cmap[i];
2663     rmap_i = rmap[i];
2664     irow_i = irow[i];
2665     jmax   = nrow[i];
2666     for (j = 0; j < jmax; j++) {
2667       row  = irow_i[j];
2668       proc = row2proc_i[j];
2669       if (proc == rank) {
2670         old_row = row;
2671 #if defined(PETSC_USE_CTABLE)
2672         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2673         row--;
2674 #else
2675         row = rmap_i[row];
2676 #endif
2677         ilen_row = imat_ilen[row];
2678         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2679         mat_i = imat_i[row];
2680         mat_a = imat_a + mat_i;
2681         mat_j = imat_j + mat_i;
2682         if (!allcolumns[i]) {
2683           for (k = 0; k < ncols; k++) {
2684 #if defined(PETSC_USE_CTABLE)
2685             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2686 #else
2687             tcol = cmap_i[cols[k]];
2688 #endif
2689             if (tcol) {
2690               *mat_j++ = tcol - 1;
2691               *mat_a++ = vals[k];
2692               ilen_row++;
2693             }
2694           }
2695         } else { /* allcolumns */
2696           for (k = 0; k < ncols; k++) {
2697             *mat_j++ = cols[k]; /* global col index! */
2698             *mat_a++ = vals[k];
2699             ilen_row++;
2700           }
2701         }
2702         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2703 
2704         imat_ilen[row] = ilen_row;
2705       }
2706     }
2707   }
2708 
2709   /* Now assemble the off proc rows */
2710   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2711   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2712     sbuf1_i = sbuf1[pa[tmp2]];
2713     jmax    = sbuf1_i[0];
2714     ct1     = 2 * jmax + 1;
2715     ct2     = 0;
2716     rbuf2_i = rbuf2[tmp2];
2717     rbuf3_i = rbuf3[tmp2];
2718     rbuf4_i = rbuf4[tmp2];
2719     for (j = 1; j <= jmax; j++) {
2720       is_no  = sbuf1_i[2 * j - 1];
2721       rmap_i = rmap[is_no];
2722       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2723       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2724       imat_ilen = subc->ilen;
2725       imat_j    = subc->j;
2726       imat_i    = subc->i;
2727       imat_a    = subc->a;
2728       max1      = sbuf1_i[2 * j];
2729       for (k = 0; k < max1; k++, ct1++) {
2730         row = sbuf1_i[ct1];
2731 #if defined(PETSC_USE_CTABLE)
2732         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2733         row--;
2734 #else
2735         row = rmap_i[row];
2736 #endif
2737         ilen  = imat_ilen[row];
2738         mat_i = imat_i[row];
2739         mat_a = imat_a + mat_i;
2740         mat_j = imat_j + mat_i;
2741         max2  = rbuf2_i[ct1];
2742         if (!allcolumns[is_no]) {
2743           for (l = 0; l < max2; l++, ct2++) {
2744 #if defined(PETSC_USE_CTABLE)
2745             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2746 #else
2747             tcol = cmap_i[rbuf3_i[ct2]];
2748 #endif
2749             if (tcol) {
2750               *mat_j++ = tcol - 1;
2751               *mat_a++ = rbuf4_i[ct2];
2752               ilen++;
2753             }
2754           }
2755         } else { /* allcolumns */
2756           for (l = 0; l < max2; l++, ct2++) {
2757             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2758             *mat_a++ = rbuf4_i[ct2];
2759             ilen++;
2760           }
2761         }
2762         imat_ilen[row] = ilen;
2763       }
2764     }
2765   }
2766 
2767   if (!iscsorted) { /* sort column indices of the rows */
2768     for (i = 0; i < ismax; i++) {
2769       subc      = (Mat_SeqAIJ *)submats[i]->data;
2770       imat_j    = subc->j;
2771       imat_i    = subc->i;
2772       imat_a    = subc->a;
2773       imat_ilen = subc->ilen;
2774 
2775       if (allcolumns[i]) continue;
2776       jmax = nrow[i];
2777       for (j = 0; j < jmax; j++) {
2778         mat_i = imat_i[j];
2779         mat_a = imat_a + mat_i;
2780         mat_j = imat_j + mat_i;
2781         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2782       }
2783     }
2784   }
2785 
2786   PetscCall(PetscFree(r_waits4));
2787   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2788   PetscCall(PetscFree(s_waits4));
2789 
2790   /* Restore the indices */
2791   for (i = 0; i < ismax; i++) {
2792     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2793     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2794   }
2795 
2796   for (i = 0; i < ismax; i++) {
2797     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2798     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2799   }
2800 
2801   /* Destroy allocated memory */
2802   if (sbuf_aa) {
2803     PetscCall(PetscFree(sbuf_aa[0]));
2804     PetscCall(PetscFree(sbuf_aa));
2805   }
2806   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2807 
2808   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2809   PetscCall(PetscFree(rbuf4));
2810 
2811   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2812   PetscFunctionReturn(0);
2813 }
2814 
2815 /*
2816  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2817  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2818  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2819  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2820  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2821  state, and needs to be "assembled" later by compressing B's column space.
2822 
2823  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2824  Following this call, C->A & C->B have been created, even if empty.
2825  */
2826 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2827 {
2828   /* If making this function public, change the error returned in this function away from _PLIB. */
2829   Mat_MPIAIJ     *aij;
2830   Mat_SeqAIJ     *Baij;
2831   PetscBool       seqaij, Bdisassembled;
2832   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2833   PetscScalar     v;
2834   const PetscInt *rowindices, *colindices;
2835 
2836   PetscFunctionBegin;
2837   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2838   if (A) {
2839     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2840     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2841     if (rowemb) {
2842       PetscCall(ISGetLocalSize(rowemb, &m));
2843       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2844     } else {
2845       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2846     }
2847     if (dcolemb) {
2848       PetscCall(ISGetLocalSize(dcolemb, &n));
2849       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2850     } else {
2851       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2852     }
2853   }
2854   if (B) {
2855     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2856     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2857     if (rowemb) {
2858       PetscCall(ISGetLocalSize(rowemb, &m));
2859       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2860     } else {
2861       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2862     }
2863     if (ocolemb) {
2864       PetscCall(ISGetLocalSize(ocolemb, &n));
2865       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2866     } else {
2867       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2868     }
2869   }
2870 
2871   aij = (Mat_MPIAIJ *)C->data;
2872   if (!aij->A) {
2873     /* Mimic parts of MatMPIAIJSetPreallocation() */
2874     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2875     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2876     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2877     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2878   }
2879   if (A) {
2880     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2881   } else {
2882     PetscCall(MatSetUp(aij->A));
2883   }
2884   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2885     /*
2886       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2887       need to "disassemble" B -- convert it to using C's global indices.
2888       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2889 
2890       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2891       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2892 
2893       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2894       At least avoid calling MatSetValues() and the implied searches?
2895     */
2896 
2897     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2898 #if defined(PETSC_USE_CTABLE)
2899       PetscCall(PetscHMapIDestroy(&aij->colmap));
2900 #else
2901       PetscCall(PetscFree(aij->colmap));
2902       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2903 #endif
2904       ngcol = 0;
2905       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2906       if (aij->garray) { PetscCall(PetscFree(aij->garray)); }
2907       PetscCall(VecDestroy(&aij->lvec));
2908       PetscCall(VecScatterDestroy(&aij->Mvctx));
2909     }
2910     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2911     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2912   }
2913   Bdisassembled = PETSC_FALSE;
2914   if (!aij->B) {
2915     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2916     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2917     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2918     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2919     Bdisassembled = PETSC_TRUE;
2920   }
2921   if (B) {
2922     Baij = (Mat_SeqAIJ *)B->data;
2923     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2924       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2925       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2926       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2927       PetscCall(PetscFree(nz));
2928     }
2929 
2930     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2931     shift      = rend - rstart;
2932     count      = 0;
2933     rowindices = NULL;
2934     colindices = NULL;
2935     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2936     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2937     for (i = 0; i < B->rmap->n; i++) {
2938       PetscInt row;
2939       row = i;
2940       if (rowindices) row = rowindices[i];
2941       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2942         col = Baij->j[count];
2943         if (colindices) col = colindices[col];
2944         if (Bdisassembled && col >= rstart) col += shift;
2945         v = Baij->a[count];
2946         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2947         ++count;
2948       }
2949     }
2950     /* No assembly for aij->B is necessary. */
2951     /* FIXME: set aij->B's nonzerostate correctly. */
2952   } else {
2953     PetscCall(MatSetUp(aij->B));
2954   }
2955   C->preallocated  = PETSC_TRUE;
2956   C->was_assembled = PETSC_FALSE;
2957   C->assembled     = PETSC_FALSE;
2958   /*
2959       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2960       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2961    */
2962   PetscFunctionReturn(0);
2963 }
2964 
2965 /*
2966   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2967  */
2968 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2969 {
2970   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2971 
2972   PetscFunctionBegin;
2973   PetscValidPointer(A, 2);
2974   PetscValidPointer(B, 3);
2975   /* FIXME: make sure C is assembled */
2976   *A = aij->A;
2977   *B = aij->B;
2978   /* Note that we don't incref *A and *B, so be careful! */
2979   PetscFunctionReturn(0);
2980 }
2981 
2982 /*
2983   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2984   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2985 */
2986 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
2987 {
2988   PetscMPIInt size, flag;
2989   PetscInt    i, ii, cismax, ispar;
2990   Mat        *A, *B;
2991   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
2992 
2993   PetscFunctionBegin;
2994   if (!ismax) PetscFunctionReturn(0);
2995 
2996   for (i = 0, cismax = 0; i < ismax; ++i) {
2997     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
2998     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2999     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3000     if (size > 1) ++cismax;
3001   }
3002 
3003   /*
3004      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3005      ispar counts the number of parallel ISs across C's comm.
3006   */
3007   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3008   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3009     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3010     PetscFunctionReturn(0);
3011   }
3012 
3013   /* if (ispar) */
3014   /*
3015     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3016     These are used to extract the off-diag portion of the resulting parallel matrix.
3017     The row IS for the off-diag portion is the same as for the diag portion,
3018     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3019   */
3020   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3021   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3022   for (i = 0, ii = 0; i < ismax; ++i) {
3023     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3024     if (size > 1) {
3025       /*
3026          TODO: This is the part that's ***NOT SCALABLE***.
3027          To fix this we need to extract just the indices of C's nonzero columns
3028          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3029          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3030          to be done without serializing on the IS list, so, most likely, it is best
3031          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3032       */
3033       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3034       /* Now we have to
3035          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3036              were sorted on each rank, concatenated they might no longer be sorted;
3037          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3038              indices in the nondecreasing order to the original index positions.
3039          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3040       */
3041       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3042       PetscCall(ISSort(ciscol[ii]));
3043       ++ii;
3044     }
3045   }
3046   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3047   for (i = 0, ii = 0; i < ismax; ++i) {
3048     PetscInt        j, issize;
3049     const PetscInt *indices;
3050 
3051     /*
3052        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3053      */
3054     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3055     PetscCall(ISSort(isrow[i]));
3056     PetscCall(ISGetLocalSize(isrow[i], &issize));
3057     PetscCall(ISGetIndices(isrow[i], &indices));
3058     for (j = 1; j < issize; ++j) {
3059       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3060     }
3061     PetscCall(ISRestoreIndices(isrow[i], &indices));
3062     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3063     PetscCall(ISSort(iscol[i]));
3064     PetscCall(ISGetLocalSize(iscol[i], &issize));
3065     PetscCall(ISGetIndices(iscol[i], &indices));
3066     for (j = 1; j < issize; ++j) {
3067       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3068     }
3069     PetscCall(ISRestoreIndices(iscol[i], &indices));
3070     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3071     if (size > 1) {
3072       cisrow[ii] = isrow[i];
3073       ++ii;
3074     }
3075   }
3076   /*
3077     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3078     array of sequential matrices underlying the resulting parallel matrices.
3079     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3080     contain duplicates.
3081 
3082     There are as many diag matrices as there are original index sets. There are only as many parallel
3083     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3084 
3085     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3086     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3087       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3088       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3089       with A[i] and B[ii] extracted from the corresponding MPI submat.
3090     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3091       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3092       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3093       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3094       values into A[i] and B[ii] sitting inside the corresponding submat.
3095     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3096       A[i], B[ii], AA[i] or BB[ii] matrices.
3097   */
3098   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3099   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3100 
3101   /* Now obtain the sequential A and B submatrices separately. */
3102   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3103   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3104   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3105 
3106   /*
3107     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3108     matrices A & B have been extracted directly into the parallel matrices containing them, or
3109     simply into the sequential matrix identical with the corresponding A (if size == 1).
3110     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3111     to have the same sparsity pattern.
3112     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3113     must be constructed for C. This is done by setseqmat(s).
3114   */
3115   for (i = 0, ii = 0; i < ismax; ++i) {
3116     /*
3117        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3118        That way we can avoid sorting and computing permutations when reusing.
3119        To this end:
3120         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3121         - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3123     */
3124     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3125 
3126     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3127     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3128     if (size > 1) {
3129       if (scall == MAT_INITIAL_MATRIX) {
3130         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3131         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3132         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3133         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3134         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3135       }
3136       /*
3137         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3138       */
3139       {
3140         Mat AA = A[i], BB = B[ii];
3141 
3142         if (AA || BB) {
3143           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3144           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3145           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3146         }
3147         PetscCall(MatDestroy(&AA));
3148       }
3149       PetscCall(ISDestroy(ciscol + ii));
3150       PetscCall(ISDestroy(ciscol_p + ii));
3151       ++ii;
3152     } else { /* if (size == 1) */
3153       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3154       if (isrow_p[i] || iscol_p[i]) {
3155         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3156         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3159         PetscCall(MatDestroy(A + i));
3160       } else (*submat)[i] = A[i];
3161     }
3162     PetscCall(ISDestroy(&isrow_p[i]));
3163     PetscCall(ISDestroy(&iscol_p[i]));
3164   }
3165   PetscCall(PetscFree2(cisrow, ciscol));
3166   PetscCall(PetscFree2(isrow_p, iscol_p));
3167   PetscCall(PetscFree(ciscol_p));
3168   PetscCall(PetscFree(A));
3169   PetscCall(MatDestroySubMatrices(cismax, &B));
3170   PetscFunctionReturn(0);
3171 }
3172 
/*
   MatCreateSubMatricesMPI_MPIAIJ - MPIAIJ implementation of MatCreateSubMatricesMPI().

   Thin dispatch wrapper: all of the work is done by the type-generic driver
   MatCreateSubMatricesMPI_MPIXAIJ(), which is parameterized here with the four
   AIJ-specific callbacks (sequential submatrix extraction, retrieval of the
   underlying Seq diag/off-diag pieces, and embedding of Seq matrices back into
   the parallel result).

   Input Parameters:
+  C     - the MPIAIJ matrix to extract submatrices from
.  ismax - number of index-set pairs (and resulting submatrices)
.  isrow - array of ismax row index sets
.  iscol - array of ismax column index sets
-  scall - MAT_INITIAL_MATRIX to create the submatrices, MAT_REUSE_MATRIX to refill them

   Output Parameter:
.  submat - array of extracted (possibly parallel) submatrices
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  /* Delegate to the generic XAIJ driver with the AIJ-flavored helper routines. */
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}
3179