xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
22 {
23   PetscInt i;
24 
25   PetscFunctionBegin;
26   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
27   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
28   PetscFunctionReturn(0);
29 }
30 
31 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
32 {
33   PetscInt i;
34 
35   PetscFunctionBegin;
36   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
37   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
38   PetscFunctionReturn(0);
39 }
40 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add exactly one layer of overlap to the
   nidx index sets in is[], replacing each IS in place.

   Strategy (memory use proportional to local data only, hence "scalable"):
     1. gather all requested rows from all index sets and classify them by owner rank;
     2. if no rank needs remote rows, only local expansion is required - early return;
     3. otherwise exchange <is id, row> pairs with the owning ranks via
        PetscCommBuildTwoSided + a PetscSF broadcast;
     4. owners compute the overlap for the received rows (_Send_Scalable) and the
        results are shipped back with a second two-sided exchange + SF broadcast,
        overlapped with the local expansion (_Local_Scalable);
     5. returned rows are merged into the index sets (_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i]; /* tlength = total number of requested rows over all index sets */
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors: record <owner, is id, row> for each off-rank request */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    /* only local expansion is needed */
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size: each request is a <is id, row> pair */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  /* build the SF graph: one leaf per incoming entry, rooted at the sender's buffer slot */
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j; /* sender-side offset + position */
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  /* ship the <is id, row> requests to the owning ranks */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: this time the former receivers send the reply sizes back */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: do the local expansion while replies are in flight */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  /* merge the remote overlap data into the (already locally expanded) index sets */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
183 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from other
   ranks into the index sets and rebuild each IS.

   recvdata is a packed stream of triples <is id, count, count rows...> of total length
   nrecvs. Each IS gets a private slot of (max_lsize + nrecvs) entries in indices_temp,
   which is large enough to hold its original indices plus every received row, so no
   reallocation is needed while appending. Duplicates are removed by sort before the
   new IS is created (on a duplicate of the original IS communicator).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize; /* running length of each set as rows are appended */
  }
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* keep the communicator alive past ISDestroy so the rebuilt IS lives on the same comm */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++]; /* which index set the following rows belong to */
    data_size = recvdata[i++]; /* how many rows follow */
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}
231 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for rows requested by other ranks
   (fromrows, packed as <is id, row> pairs per sender), compute the overlap data
   (the column indices of those locally owned rows in both the diagonal Amat and
   the off-diagonal Bmat) and pack the replies.

   Two passes over the same request data:
     pass 1 sizes the replies (indv_counts / totalrows), so the reply buffer can be
     allocated once; pass 2 fills sbdata with triples <is id, count, count columns...>
     per sender and accumulates per-sender sizes/offsets into sbsizes.

   Output: *sbrowsizes (pairs <size, offset> per sender) and *sbrows (packed reply
   data); both are allocated here and ownership passes to the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count reply sizes per <sender, is id> */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* global -> local row */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]]; /* map off-diagonal column to global numbering */
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* pass 2: repeat the traversal, this time packing the reply data */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue; /* nothing to reply for this index set */
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* compute per-sender offsets into sbdata (stored in the odd slots of sbsizes) */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
353 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each index set by one layer using
   only locally owned rows: for every index the rank owns, append the column indices
   of that row from both the diagonal (Amat) and off-diagonal (Bmat) parts, then sort,
   deduplicate, and rebuild the IS in place. Indices owned by other ranks are skipped
   here; they are handled by the send/receive path.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn]; /* total local nonzeros; an upper bound on the appended columns */
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remote rows are expanded by their owners */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart; /* local -> global column */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]]; /* off-diagonal columns are already mapped via gcols */
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* keep the communicator alive across ISDestroy so the new IS uses the same comm */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
420 
421 /*
422   Sample message format:
423   If a processor A wants processor B to process some elements corresponding
424   to index sets is[1],is[5]
425   mesg [0] = 2   (no of index sets in the mesg)
426   -----------
427   mesg [1] = 1 => is[1]
428   mesg [2] = sizeof(is[1]);
429   -----------
430   mesg [3] = 5  => is[5]
431   mesg [4] = sizeof(is[5]);
432   -----------
433   mesg [5]
434   mesg [n]  data(is[1])
435   -----------
436   mesg[n+1]
437   mesg[m]  data(is[5])
438   -----------
439 
440   Notes:
441   nrqs - no of requests sent (or to be sent out)
442   nrqr - no of requests received (which have to be or which have been processed)
443 */
/*
   MatIncreaseOverlap_MPIAIJ_Once - add one layer of overlap to the imax index sets
   in is[] (nonscalable path: per-IS work arrays sized by the global row count M).

   Outline:
     1. count, per destination rank, the rows each IS requests (w1..w4 work arrays);
     2. exchange message counts/lengths (PetscGatherNumberOfMessages/MessageLengths),
        post Irecvs, pack and Isend the request messages (header + row lists);
     3. expand the locally owned rows while messages are in flight
        (MatIncreaseOverlap_MPIAIJ_Local, bitmap `table` + either a dense array
        `data` or a PetscTable, depending on PETSC_USE_CTABLE);
     4. process received requests (MatIncreaseOverlap_MPIAIJ_Receive) and send the
        computed overlap rows back;
     5. merge returned rows into the local tables and rebuild each IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscTable *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j]; /* total rows destined for rank j */
        w3[j]++;        /* number of index sets with rows for rank j */
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j]; /* grow length by the header size */
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single tmp buffer into one contiguous send buffer per destination */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscTableCreate(n[i], M, &table_data[i]));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i; /* each IS gets an M-sized slice of the dense array */
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) { /* first time this local row is seen */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0]; /* one more IS in message to rank j */
          outdat_j[2 * k]     = ctr_j;         /* row count for this IS */
          outdat_j[2 * k - 1] = i;             /* IS id */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    /* keep each IS communicator alive for the rebuilt sets created below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

    /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  /* compute the overlap for the rows other ranks requested; replies go in xdata/isz1 */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i]; /* reply length for each requester */
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      /* process replies in whatever order they complete */
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1; /* data starts after the header */
      jmax    = rbuf2[idex][0];     /* number of index sets in this reply */
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];     /* row count for this IS */
        is_no   = rbuf2_i[2 * j - 1]; /* IS id */
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) { /* skip rows already present */
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  /* rebuild each IS from the accumulated rows */
  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
      tdata[--j] = --k; /* undo the +1 shift used when the entries were inserted */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscTableDestroy((PetscTable *)&table_data[i]));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}
789 
790 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
792        the work on the local processor.
793 
794      Inputs:
795       C      - MAT_MPIAIJ;
796       imax - total no of index sets processed at a time;
797       table  - an array of char - size = m bits.
798 
799      Output:
800       isz    - array containing the count of the solution elements corresponding
801                to each index set;
802       data or table_data  - pointer to the solutions
803 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* diagonal (A) and off-diagonal (B) blocks of the parallel matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps compressed local column of B to its global column index */

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table maps
       (global index + 1) -> (position + 1), the +1 shifts avoid storing zero keys/values */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      tdata[--j] = --row; /* undo the +1 shifts applied on insertion */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i]; /* snapshot current size: entries appended below are not themselves expanded in this sweep */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart; /* convert global row to local row index */
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: columns of the diagonal block, shifted to global numbering */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) { /* returns previous bit, so each index is appended only once */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: off-diagonal columns, mapped to global numbering via garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* record the grown size of index set i */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
891 
892 /*
893       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
894          and return the output
895 
896          Input:
897            C    - the matrix
898            nrqr - no of messages being processed.
899            rbuf - an array of pointers to the received requests
900 
901          Output:
902            xdata - array of messages to be sent back
903            isz1  - size of each message
904 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
909 
910 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* diagonal and off-diagonal blocks */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N; /* global row dimension; sizes the membership bit table below */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* compressed local column of B -> global column */

  /* Message format: rbuf_i[0] = number of index sets in the message, then for IS j (1-based)
     rbuf_i[2*j-1] = IS id and rbuf_i[2*j] = its length, followed by the concatenated indices.
     First pass: count total requested rows (ct) and total indices (total_sz). */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* Heuristic size for the single output chunk: requested rows times average nonzeros per
     local row; grown on demand below if the guess is too small */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0])); /* all xdata[i] alias into this one chunk */
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0; /* total output entries written so far, across all messages */
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2 * rbuf_0 + 1; /* read cursor: first index past the header */
    ct2    = ct1;            /* write cursor into xdata[i], leaving room for the same header layout */
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable)); /* duplicates are tracked per IS */
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      /* first copy the requested row indices themselves (deduplicated) */
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) { /* output chunk full: grow by 1.5x and rebase all aliases */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the columns of each copied row (the actual overlap expansion) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart; /* local row index */
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* diagonal-block columns, in global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) { /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* off-diagonal-block columns, mapped via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next reply starts right after this one in the chunk */
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(0);
}
1030 /* -------------------------------------------------------------------------*/
1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1032 /*
1033     Every processor gets the entire matrix
1034 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data; /* diagonal (ad) and off-diagonal (bd) local blocks */
  PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    /* fill my slice of lens[] with my per-row nonzero counts (diagonal + off-diagonal) */
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* MPI_IN_PLACE: each rank contributes the slice of lens[] it just filled */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE)); /* B holds the entire global matrix on this rank */
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    /* NOTE(review): Bin is allocated with length 2; presumably the second slot is used by the
       submatrix destroy path elsewhere — confirm against callers */
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write straight into my slice of B's column array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge off-diagonal (below-diagonal part), diagonal, then off-diagonal (above-diagonal part)
       so each row's global column indices come out in ascending order */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0; /* number of nonzeros owned by rank i */
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    /* reuse: the sequential matrix already exists, only its values may be refreshed below */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* write straight into my slice of B's value array */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column indices above, so values line up */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); /* NULL on the MAT_REUSE_MATRIX path */
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(0);
}
1180 
1181 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1182 {
1183   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1184   Mat             submat, A = c->A, B = c->B;
1185   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1186   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1187   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1188   const PetscInt *icol, *irow;
1189   PetscInt        nrow, ncol, start;
1190   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1191   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1192   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1193   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1194   PetscInt       *lens, rmax, ncols, *cols, Crow;
1195 #if defined(PETSC_USE_CTABLE)
1196   PetscTable cmap, rmap;
1197   PetscInt  *cmap_loc, *rmap_loc;
1198 #else
1199   PetscInt *cmap, *rmap;
1200 #endif
1201   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1202   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1203   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1204   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1205   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1206   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1207   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1208   MPI_Comm      comm;
1209   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1210   PetscMPIInt  *onodes1, *olengths1, idex, end;
1211   Mat_SubSppt  *smatis1;
1212   PetscBool     isrowsorted, iscolsorted;
1213 
1214   PetscFunctionBegin;
1215   PetscValidLogicalCollectiveInt(C, ismax, 2);
1216   PetscValidLogicalCollectiveEnum(C, scall, 5);
1217   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1218   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1219   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1220   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1221   size = c->size;
1222   rank = c->rank;
1223 
1224   PetscCall(ISSorted(iscol[0], &iscolsorted));
1225   PetscCall(ISSorted(isrow[0], &isrowsorted));
1226   PetscCall(ISGetIndices(isrow[0], &irow));
1227   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1228   if (allcolumns) {
1229     icol = NULL;
1230     ncol = C->cmap->N;
1231   } else {
1232     PetscCall(ISGetIndices(iscol[0], &icol));
1233     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1234   }
1235 
1236   if (scall == MAT_INITIAL_MATRIX) {
1237     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1238 
1239     /* Get some new tags to keep the communication clean */
1240     tag1 = ((PetscObject)C)->tag;
1241     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1242     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1243 
1244     /* evaluate communication - mesg to who, length of mesg, and buffer space
1245      required. Based on this, buffers are allocated, and data copied into them */
1246     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1247     PetscCall(PetscMalloc1(nrow, &row2proc));
1248 
1249     /* w1[proc] = num of rows owned by proc -- to be requested */
1250     proc = 0;
1251     nrqs = 0; /* num of outgoing messages */
1252     for (j = 0; j < nrow; j++) {
1253       row = irow[j];
1254       if (!isrowsorted) proc = 0;
1255       while (row >= C->rmap->range[proc + 1]) proc++;
1256       w1[proc]++;
1257       row2proc[j] = proc; /* map row index to proc */
1258 
1259       if (proc != rank && !w2[proc]) {
1260         w2[proc] = 1;
1261         nrqs++;
1262       }
1263     }
1264     w1[rank] = 0; /* rows owned by self will not be requested */
1265 
1266     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1267     for (proc = 0, j = 0; proc < size; proc++) {
1268       if (w1[proc]) pa[j++] = proc;
1269     }
1270 
1271     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1272     msz = 0; /* total mesg length (for all procs) */
1273     for (i = 0; i < nrqs; i++) {
1274       proc = pa[i];
1275       w1[proc] += 3;
1276       msz += w1[proc];
1277     }
1278     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1279 
1280     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1281     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1282     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1283 
1284     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1285        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1286     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1287 
1288     /* Now post the Irecvs corresponding to these messages */
1289     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1290 
1291     PetscCall(PetscFree(onodes1));
1292     PetscCall(PetscFree(olengths1));
1293 
1294     /* Allocate Memory for outgoing messages */
1295     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1296     PetscCall(PetscArrayzero(sbuf1, size));
1297     PetscCall(PetscArrayzero(ptr, size));
1298 
1299     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1300     iptr = tmp;
1301     for (i = 0; i < nrqs; i++) {
1302       proc        = pa[i];
1303       sbuf1[proc] = iptr;
1304       iptr += w1[proc];
1305     }
1306 
1307     /* Form the outgoing messages */
1308     /* Initialize the header space */
1309     for (i = 0; i < nrqs; i++) {
1310       proc = pa[i];
1311       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1312       ptr[proc] = sbuf1[proc] + 3;
1313     }
1314 
1315     /* Parse the isrow and copy data into outbuf */
1316     PetscCall(PetscArrayzero(ctr, size));
1317     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1318       proc = row2proc[j];
1319       if (proc != rank) { /* copy to the outgoing buf*/
1320         *ptr[proc] = irow[j];
1321         ctr[proc]++;
1322         ptr[proc]++;
1323       }
1324     }
1325 
1326     /* Update the headers for the current IS */
1327     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1328       if ((ctr_j = ctr[j])) {
1329         sbuf1_j            = sbuf1[j];
1330         k                  = ++sbuf1_j[0];
1331         sbuf1_j[2 * k]     = ctr_j;
1332         sbuf1_j[2 * k - 1] = 0;
1333       }
1334     }
1335 
1336     /* Now post the sends */
1337     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1338     for (i = 0; i < nrqs; ++i) {
1339       proc = pa[i];
1340       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1341     }
1342 
1343     /* Post Receives to capture the buffer size */
1344     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1345     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1346 
1347     if (nrqs) rbuf2[0] = tmp + msz;
1348     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1349 
1350     for (i = 0; i < nrqs; ++i) {
1351       proc = pa[i];
1352       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1353     }
1354 
1355     PetscCall(PetscFree2(w1, w2));
1356 
1357     /* Send to other procs the buf size they should allocate */
1358     /* Receive messages*/
1359     PetscCall(PetscMalloc1(nrqr, &r_status1));
1360     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1361 
1362     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1363     for (i = 0; i < nrqr; ++i) {
1364       req_size[i] = 0;
1365       rbuf1_i     = rbuf1[i];
1366       start       = 2 * rbuf1_i[0] + 1;
1367       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1368       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1369       sbuf2_i = sbuf2[i];
1370       for (j = start; j < end; j++) {
1371         k          = rbuf1_i[j] - rstart;
1372         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1373         sbuf2_i[j] = ncols;
1374         req_size[i] += ncols;
1375       }
1376       req_source1[i] = r_status1[i].MPI_SOURCE;
1377 
1378       /* form the header */
1379       sbuf2_i[0] = req_size[i];
1380       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1381 
1382       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1383     }
1384 
1385     PetscCall(PetscFree(r_status1));
1386     PetscCall(PetscFree(r_waits1));
1387 
1388     /* rbuf2 is received, Post recv column indices a->j */
1389     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1390 
1391     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1392     for (i = 0; i < nrqs; ++i) {
1393       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1394       req_source2[i] = r_status2[i].MPI_SOURCE;
1395       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1396     }
1397 
1398     /* Wait on sends1 and sends2 */
1399     PetscCall(PetscMalloc1(nrqs, &s_status1));
1400     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1401     PetscCall(PetscFree(s_waits1));
1402     PetscCall(PetscFree(s_status1));
1403 
1404     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1405     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1406 
1407     /* Now allocate sending buffers for a->j, and send them off */
1408     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1409     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1410     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1411     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1412 
1413     for (i = 0; i < nrqr; i++) { /* for each requested message */
1414       rbuf1_i   = rbuf1[i];
1415       sbuf_aj_i = sbuf_aj[i];
1416       ct1       = 2 * rbuf1_i[0] + 1;
1417       ct2       = 0;
1418       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1419 
1420       kmax = rbuf1[i][2];
1421       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1422         row    = rbuf1_i[ct1] - rstart;
1423         nzA    = ai[row + 1] - ai[row];
1424         nzB    = bi[row + 1] - bi[row];
1425         ncols  = nzA + nzB;
1426         cworkA = aj + ai[row];
1427         cworkB = bj + bi[row];
1428 
1429         /* load the column indices for this row into cols*/
1430         cols = sbuf_aj_i + ct2;
1431 
1432         lwrite = 0;
1433         for (l = 0; l < nzB; l++) {
1434           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1435         }
1436         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1437         for (l = 0; l < nzB; l++) {
1438           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1439         }
1440 
1441         ct2 += ncols;
1442       }
1443       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1444     }
1445 
1446     /* create column map (cmap): global col of C -> local col of submat */
1447 #if defined(PETSC_USE_CTABLE)
1448     if (!allcolumns) {
1449       PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap));
1450       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1451       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1452         if (icol[j] >= cstart && icol[j] < cend) {
1453           cmap_loc[icol[j] - cstart] = j + 1;
1454         } else { /* use PetscTable for non-local col indices */
1455           PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES));
1456         }
1457       }
1458     } else {
1459       cmap     = NULL;
1460       cmap_loc = NULL;
1461     }
1462     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1463 #else
1464     if (!allcolumns) {
1465       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1466       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1467     } else {
1468       cmap = NULL;
1469     }
1470 #endif
1471 
1472     /* Create lens for MatSeqAIJSetPreallocation() */
1473     PetscCall(PetscCalloc1(nrow, &lens));
1474 
1475     /* Compute lens from local part of C */
1476     for (j = 0; j < nrow; j++) {
1477       row  = irow[j];
1478       proc = row2proc[j];
1479       if (proc == rank) {
1480         /* diagonal part A = c->A */
1481         ncols = ai[row - rstart + 1] - ai[row - rstart];
1482         cols  = aj + ai[row - rstart];
1483         if (!allcolumns) {
1484           for (k = 0; k < ncols; k++) {
1485 #if defined(PETSC_USE_CTABLE)
1486             tcol = cmap_loc[cols[k]];
1487 #else
1488             tcol = cmap[cols[k] + cstart];
1489 #endif
1490             if (tcol) lens[j]++;
1491           }
1492         } else { /* allcolumns */
1493           lens[j] = ncols;
1494         }
1495 
1496         /* off-diagonal part B = c->B */
1497         ncols = bi[row - rstart + 1] - bi[row - rstart];
1498         cols  = bj + bi[row - rstart];
1499         if (!allcolumns) {
1500           for (k = 0; k < ncols; k++) {
1501 #if defined(PETSC_USE_CTABLE)
1502             PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1503 #else
1504             tcol = cmap[bmap[cols[k]]];
1505 #endif
1506             if (tcol) lens[j]++;
1507           }
1508         } else { /* allcolumns */
1509           lens[j] += ncols;
1510         }
1511       }
1512     }
1513 
1514     /* Create row map (rmap): global row of C -> local row of submat */
1515 #if defined(PETSC_USE_CTABLE)
1516     PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap));
1517     for (j = 0; j < nrow; j++) {
1518       row  = irow[j];
1519       proc = row2proc[j];
1520       if (proc == rank) { /* a local row */
1521         rmap_loc[row - rstart] = j;
1522       } else {
1523         PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES));
1524       }
1525     }
1526 #else
1527     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1528     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1529 #endif
1530 
1531     /* Update lens from offproc data */
1532     /* recv a->j is done */
1533     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1534     for (i = 0; i < nrqs; i++) {
1535       proc    = pa[i];
1536       sbuf1_i = sbuf1[proc];
1537       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1538       ct1     = 2 + 1;
1539       ct2     = 0;
1540       rbuf2_i = rbuf2[i]; /* received length of C->j */
1541       rbuf3_i = rbuf3[i]; /* received C->j */
1542 
1543       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1544       max1 = sbuf1_i[2];
1545       for (k = 0; k < max1; k++, ct1++) {
1546 #if defined(PETSC_USE_CTABLE)
1547         PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row));
1548         row--;
1549         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1550 #else
1551         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1552 #endif
1553         /* Now, store row index of submat in sbuf1_i[ct1] */
1554         sbuf1_i[ct1] = row;
1555 
1556         nnz = rbuf2_i[ct1];
1557         if (!allcolumns) {
1558           for (l = 0; l < nnz; l++, ct2++) {
1559 #if defined(PETSC_USE_CTABLE)
1560             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1561               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1562             } else {
1563               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1564             }
1565 #else
1566             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1567 #endif
1568             if (tcol) lens[row]++;
1569           }
1570         } else { /* allcolumns */
1571           lens[row] += nnz;
1572         }
1573       }
1574     }
1575     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1576     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1577 
1578     /* Create the submatrices */
1579     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1580     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1581 
1582     PetscCall(ISGetBlockSize(isrow[0], &i));
1583     PetscCall(ISGetBlockSize(iscol[0], &j));
1584     PetscCall(MatSetBlockSizes(submat, i, j));
1585     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1586     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1587 
1588     /* create struct Mat_SubSppt and attached it to submat */
1589     PetscCall(PetscNew(&smatis1));
1590     subc            = (Mat_SeqAIJ *)submat->data;
1591     subc->submatis1 = smatis1;
1592 
1593     smatis1->id          = 0;
1594     smatis1->nrqs        = nrqs;
1595     smatis1->nrqr        = nrqr;
1596     smatis1->rbuf1       = rbuf1;
1597     smatis1->rbuf2       = rbuf2;
1598     smatis1->rbuf3       = rbuf3;
1599     smatis1->sbuf2       = sbuf2;
1600     smatis1->req_source2 = req_source2;
1601 
1602     smatis1->sbuf1 = sbuf1;
1603     smatis1->ptr   = ptr;
1604     smatis1->tmp   = tmp;
1605     smatis1->ctr   = ctr;
1606 
1607     smatis1->pa          = pa;
1608     smatis1->req_size    = req_size;
1609     smatis1->req_source1 = req_source1;
1610 
1611     smatis1->allcolumns = allcolumns;
1612     smatis1->singleis   = PETSC_TRUE;
1613     smatis1->row2proc   = row2proc;
1614     smatis1->rmap       = rmap;
1615     smatis1->cmap       = cmap;
1616 #if defined(PETSC_USE_CTABLE)
1617     smatis1->rmap_loc = rmap_loc;
1618     smatis1->cmap_loc = cmap_loc;
1619 #endif
1620 
1621     smatis1->destroy     = submat->ops->destroy;
1622     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1623     submat->factortype   = C->factortype;
1624 
1625     /* compute rmax */
1626     rmax = 0;
1627     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1628 
1629   } else { /* scall == MAT_REUSE_MATRIX */
1630     submat = submats[0];
1631     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1632 
1633     subc        = (Mat_SeqAIJ *)submat->data;
1634     rmax        = subc->rmax;
1635     smatis1     = subc->submatis1;
1636     nrqs        = smatis1->nrqs;
1637     nrqr        = smatis1->nrqr;
1638     rbuf1       = smatis1->rbuf1;
1639     rbuf2       = smatis1->rbuf2;
1640     rbuf3       = smatis1->rbuf3;
1641     req_source2 = smatis1->req_source2;
1642 
1643     sbuf1 = smatis1->sbuf1;
1644     sbuf2 = smatis1->sbuf2;
1645     ptr   = smatis1->ptr;
1646     tmp   = smatis1->tmp;
1647     ctr   = smatis1->ctr;
1648 
1649     pa          = smatis1->pa;
1650     req_size    = smatis1->req_size;
1651     req_source1 = smatis1->req_source1;
1652 
1653     allcolumns = smatis1->allcolumns;
1654     row2proc   = smatis1->row2proc;
1655     rmap       = smatis1->rmap;
1656     cmap       = smatis1->cmap;
1657 #if defined(PETSC_USE_CTABLE)
1658     rmap_loc = smatis1->rmap_loc;
1659     cmap_loc = smatis1->cmap_loc;
1660 #endif
1661   }
1662 
1663   /* Post recv matrix values */
1664   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1665   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1666   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1667   for (i = 0; i < nrqs; ++i) {
1668     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1669     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1670   }
1671 
1672   /* Allocate sending buffers for a->a, and send them off */
1673   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1674   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1675   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1676   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1677 
1678   for (i = 0; i < nrqr; i++) {
1679     rbuf1_i   = rbuf1[i];
1680     sbuf_aa_i = sbuf_aa[i];
1681     ct1       = 2 * rbuf1_i[0] + 1;
1682     ct2       = 0;
1683     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1684 
1685     kmax = rbuf1_i[2];
1686     for (k = 0; k < kmax; k++, ct1++) {
1687       row    = rbuf1_i[ct1] - rstart;
1688       nzA    = ai[row + 1] - ai[row];
1689       nzB    = bi[row + 1] - bi[row];
1690       ncols  = nzA + nzB;
1691       cworkB = bj + bi[row];
1692       vworkA = a_a + ai[row];
1693       vworkB = b_a + bi[row];
1694 
1695       /* load the column values for this row into vals*/
1696       vals = sbuf_aa_i + ct2;
1697 
1698       lwrite = 0;
1699       for (l = 0; l < nzB; l++) {
1700         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1701       }
1702       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1703       for (l = 0; l < nzB; l++) {
1704         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1705       }
1706 
1707       ct2 += ncols;
1708     }
1709     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1710   }
1711 
1712   /* Assemble submat */
1713   /* First assemble the local rows */
1714   for (j = 0; j < nrow; j++) {
1715     row  = irow[j];
1716     proc = row2proc[j];
1717     if (proc == rank) {
1718       Crow = row - rstart; /* local row index of C */
1719 #if defined(PETSC_USE_CTABLE)
1720       row = rmap_loc[Crow]; /* row index of submat */
1721 #else
1722       row = rmap[row];
1723 #endif
1724 
1725       if (allcolumns) {
1726         /* diagonal part A = c->A */
1727         ncols = ai[Crow + 1] - ai[Crow];
1728         cols  = aj + ai[Crow];
1729         vals  = a_a + ai[Crow];
1730         i     = 0;
1731         for (k = 0; k < ncols; k++) {
1732           subcols[i]   = cols[k] + cstart;
1733           subvals[i++] = vals[k];
1734         }
1735 
1736         /* off-diagonal part B = c->B */
1737         ncols = bi[Crow + 1] - bi[Crow];
1738         cols  = bj + bi[Crow];
1739         vals  = b_a + bi[Crow];
1740         for (k = 0; k < ncols; k++) {
1741           subcols[i]   = bmap[cols[k]];
1742           subvals[i++] = vals[k];
1743         }
1744 
1745         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1746 
1747       } else { /* !allcolumns */
1748 #if defined(PETSC_USE_CTABLE)
1749         /* diagonal part A = c->A */
1750         ncols = ai[Crow + 1] - ai[Crow];
1751         cols  = aj + ai[Crow];
1752         vals  = a_a + ai[Crow];
1753         i     = 0;
1754         for (k = 0; k < ncols; k++) {
1755           tcol = cmap_loc[cols[k]];
1756           if (tcol) {
1757             subcols[i]   = --tcol;
1758             subvals[i++] = vals[k];
1759           }
1760         }
1761 
1762         /* off-diagonal part B = c->B */
1763         ncols = bi[Crow + 1] - bi[Crow];
1764         cols  = bj + bi[Crow];
1765         vals  = b_a + bi[Crow];
1766         for (k = 0; k < ncols; k++) {
1767           PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1768           if (tcol) {
1769             subcols[i]   = --tcol;
1770             subvals[i++] = vals[k];
1771           }
1772         }
1773 #else
1774         /* diagonal part A = c->A */
1775         ncols = ai[Crow + 1] - ai[Crow];
1776         cols  = aj + ai[Crow];
1777         vals  = a_a + ai[Crow];
1778         i     = 0;
1779         for (k = 0; k < ncols; k++) {
1780           tcol = cmap[cols[k] + cstart];
1781           if (tcol) {
1782             subcols[i]   = --tcol;
1783             subvals[i++] = vals[k];
1784           }
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow + 1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k = 0; k < ncols; k++) {
1792           tcol = cmap[bmap[cols[k]]];
1793           if (tcol) {
1794             subcols[i]   = --tcol;
1795             subvals[i++] = vals[k];
1796           }
1797         }
1798 #endif
1799         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1800       }
1801     }
1802   }
1803 
1804   /* Now assemble the off-proc rows */
1805   for (i = 0; i < nrqs; i++) { /* for each requested message */
1806     /* recv values from other processes */
1807     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1808     proc    = pa[idex];
1809     sbuf1_i = sbuf1[proc];
1810     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1811     ct1     = 2 + 1;
1812     ct2     = 0;           /* count of received C->j */
1813     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1814     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1815     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1816     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1817 
1818     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1819     max1 = sbuf1_i[2];                  /* num of rows */
1820     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1821       row = sbuf1_i[ct1];               /* row index of submat */
1822       if (!allcolumns) {
1823         idex = 0;
1824         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1825           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1826           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1827 #if defined(PETSC_USE_CTABLE)
1828             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1829               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1830             } else {
1831               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1832             }
1833 #else
1834             tcol = cmap[rbuf3_i[ct2]];
1835 #endif
1836             if (tcol) {
1837               subcols[idex]   = --tcol; /* may not be sorted */
1838               subvals[idex++] = rbuf4_i[ct2];
1839 
1840               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1841                For reuse, we replace received C->j with index that should be inserted to submat */
1842               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1843             }
1844           }
1845           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1846         } else { /* scall == MAT_REUSE_MATRIX */
1847           submat = submats[0];
1848           subc   = (Mat_SeqAIJ *)submat->data;
1849 
1850           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1851           for (l = 0; l < nnz; l++) {
1852             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1853             subvals[idex++] = rbuf4_i[ct2];
1854           }
1855 
1856           bj = subc->j + subc->i[row]; /* sorted column indices */
1857           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1858         }
1859       } else {              /* allcolumns */
1860         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1861         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1862         ct2 += nnz;
1863       }
1864     }
1865   }
1866 
1867   /* sending a->a are done */
1868   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1869   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1870 
1871   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1872   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1873   submats[0] = submat;
1874 
1875   /* Restore the indices */
1876   PetscCall(ISRestoreIndices(isrow[0], &irow));
1877   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1878 
1879   /* Destroy allocated memory */
1880   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1881   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1882   if (sbuf_aa) {
1883     PetscCall(PetscFree(sbuf_aa[0]));
1884     PetscCall(PetscFree(sbuf_aa));
1885   }
1886 
1887   if (scall == MAT_INITIAL_MATRIX) {
1888     PetscCall(PetscFree(lens));
1889     if (sbuf_aj) {
1890       PetscCall(PetscFree(sbuf_aj[0]));
1891       PetscCall(PetscFree(sbuf_aj));
1892     }
1893   }
1894   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1895   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1900 {
1901   PetscInt  ncol;
1902   PetscBool colflag, allcolumns = PETSC_FALSE;
1903 
1904   PetscFunctionBegin;
1905   /* Allocate memory to hold all the submatrices */
1906   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1907 
1908   /* Check for special case: each processor gets entire matrix columns */
1909   PetscCall(ISIdentity(iscol[0], &colflag));
1910   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1911   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1912 
1913   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1914   PetscFunctionReturn(0);
1915 }
1916 
/*
   MatCreateSubMatrices_MPIAIJ - Extracts ismax sequential submatrices of the parallel
   matrix C, one per (isrow[i], iscol[i]) pair.

   Collective on C (unless C->submat_singleis short-circuits to the single-IS path).

   Input:
     C     - the MPIAIJ matrix
     ismax - number of index-set pairs on this process
     isrow - row index sets
     iscol - column index sets
     scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
   Output:
     submat - array of sequential submatrices (allocated here when scall == MAT_INITIAL_MATRIX)

   The work is done in stages of at most nmax submatrices each, so that the scratch
   memory per stage stays bounded (~20 MB of PetscInt, scaled by the column count).
   Every process must loop through the same (global) number of stages because each
   stage performs collective communication; processes with fewer local IS get dummy
   submatrices to keep the stage count aligned.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices that fit in a ~20 MB PetscInt budget per stage;
     an empty column map is treated as "no per-column cost" */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* always allow at least one submatrix per stage */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single square identity (row,col) selection means "copy the whole matrix";
         a dedicated fast path exists for that */
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single MPI_MAX reduction carries both flags: in[0] is negated so that
       MPI_MAX answers "does ANY process want the whole matrix" (logical OR),
       in[1] gives the largest local stage count, which all must iterate */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* previous MAT_INITIAL_MATRIX call went through the single-IS path; reuse it too */
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* stage count saved from the initial call */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of real submatrices this process extracts in this stage
       (0 once local IS are exhausted; the stage is still executed for collectiveness) */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages; /* stash stage count on the dummy for MAT_REUSE_MATRIX */
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2013 
2014 /* -------------------------------------------------------------------------*/
2015 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2016 {
2017   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2018   Mat              A = c->A;
2019   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2020   const PetscInt **icol, **irow;
2021   PetscInt        *nrow, *ncol, start;
2022   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2023   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2024   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2025   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2026   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2027 #if defined(PETSC_USE_CTABLE)
2028   PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2029 #else
2030   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2031 #endif
2032   const PetscInt *irow_i;
2033   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2034   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2035   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2036   MPI_Comm        comm;
2037   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2038   PetscMPIInt    *onodes1, *olengths1, end;
2039   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2040   Mat_SubSppt    *smat_i;
2041   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2042   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2043 
2044   PetscFunctionBegin;
2045   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2046   size = c->size;
2047   rank = c->rank;
2048 
2049   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2050   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2051 
2052   for (i = 0; i < ismax; i++) {
2053     PetscCall(ISSorted(iscol[i], &issorted[i]));
2054     if (!issorted[i]) iscsorted = issorted[i];
2055 
2056     PetscCall(ISSorted(isrow[i], &issorted[i]));
2057 
2058     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2059     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2060 
2061     /* Check for special case: allcolumn */
2062     PetscCall(ISIdentity(iscol[i], &colflag));
2063     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2064     if (colflag && ncol[i] == C->cmap->N) {
2065       allcolumns[i] = PETSC_TRUE;
2066       icol[i]       = NULL;
2067     } else {
2068       allcolumns[i] = PETSC_FALSE;
2069       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2070     }
2071   }
2072 
2073   if (scall == MAT_REUSE_MATRIX) {
2074     /* Assumes new rows are same length as the old rows */
2075     for (i = 0; i < ismax; i++) {
2076       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2077       subc = (Mat_SeqAIJ *)submats[i]->data;
2078       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2079 
2080       /* Initial matrix as if empty */
2081       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2082 
2083       smat_i = subc->submatis1;
2084 
2085       nrqs        = smat_i->nrqs;
2086       nrqr        = smat_i->nrqr;
2087       rbuf1       = smat_i->rbuf1;
2088       rbuf2       = smat_i->rbuf2;
2089       rbuf3       = smat_i->rbuf3;
2090       req_source2 = smat_i->req_source2;
2091 
2092       sbuf1 = smat_i->sbuf1;
2093       sbuf2 = smat_i->sbuf2;
2094       ptr   = smat_i->ptr;
2095       tmp   = smat_i->tmp;
2096       ctr   = smat_i->ctr;
2097 
2098       pa          = smat_i->pa;
2099       req_size    = smat_i->req_size;
2100       req_source1 = smat_i->req_source1;
2101 
2102       allcolumns[i] = smat_i->allcolumns;
2103       row2proc[i]   = smat_i->row2proc;
2104       rmap[i]       = smat_i->rmap;
2105       cmap[i]       = smat_i->cmap;
2106     }
2107 
2108     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2109       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2110       smat_i = (Mat_SubSppt *)submats[0]->data;
2111 
2112       nrqs        = smat_i->nrqs;
2113       nrqr        = smat_i->nrqr;
2114       rbuf1       = smat_i->rbuf1;
2115       rbuf2       = smat_i->rbuf2;
2116       rbuf3       = smat_i->rbuf3;
2117       req_source2 = smat_i->req_source2;
2118 
2119       sbuf1 = smat_i->sbuf1;
2120       sbuf2 = smat_i->sbuf2;
2121       ptr   = smat_i->ptr;
2122       tmp   = smat_i->tmp;
2123       ctr   = smat_i->ctr;
2124 
2125       pa          = smat_i->pa;
2126       req_size    = smat_i->req_size;
2127       req_source1 = smat_i->req_source1;
2128 
2129       allcolumns[0] = PETSC_FALSE;
2130     }
2131   } else { /* scall == MAT_INITIAL_MATRIX */
2132     /* Get some new tags to keep the communication clean */
2133     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2134     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2135 
2136     /* evaluate communication - mesg to who, length of mesg, and buffer space
2137      required. Based on this, buffers are allocated, and data copied into them*/
2138     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2139 
2140     for (i = 0; i < ismax; i++) {
2141       jmax   = nrow[i];
2142       irow_i = irow[i];
2143 
2144       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2145       row2proc[i] = row2proc_i;
2146 
2147       if (issorted[i]) proc = 0;
2148       for (j = 0; j < jmax; j++) {
2149         if (!issorted[i]) proc = 0;
2150         row = irow_i[j];
2151         while (row >= C->rmap->range[proc + 1]) proc++;
2152         w4[proc]++;
2153         row2proc_i[j] = proc; /* map row index to proc */
2154       }
2155       for (j = 0; j < size; j++) {
2156         if (w4[j]) {
2157           w1[j] += w4[j];
2158           w3[j]++;
2159           w4[j] = 0;
2160         }
2161       }
2162     }
2163 
2164     nrqs     = 0; /* no of outgoing messages */
2165     msz      = 0; /* total mesg length (for all procs) */
2166     w1[rank] = 0; /* no mesg sent to self */
2167     w3[rank] = 0;
2168     for (i = 0; i < size; i++) {
2169       if (w1[i]) {
2170         w2[i] = 1;
2171         nrqs++;
2172       } /* there exists a message to proc i */
2173     }
2174     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2175     for (i = 0, j = 0; i < size; i++) {
2176       if (w1[i]) {
2177         pa[j] = i;
2178         j++;
2179       }
2180     }
2181 
2182     /* Each message would have a header = 1 + 2*(no of IS) + data */
2183     for (i = 0; i < nrqs; i++) {
2184       j = pa[i];
2185       w1[j] += w2[j] + 2 * w3[j];
2186       msz += w1[j];
2187     }
2188     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2189 
2190     /* Determine the number of messages to expect, their lengths, from from-ids */
2191     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2192     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2193 
2194     /* Now post the Irecvs corresponding to these messages */
2195     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2196     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2197 
2198     /* Allocate Memory for outgoing messages */
2199     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2200     PetscCall(PetscArrayzero(sbuf1, size));
2201     PetscCall(PetscArrayzero(ptr, size));
2202 
2203     {
2204       PetscInt *iptr = tmp;
2205       k              = 0;
2206       for (i = 0; i < nrqs; i++) {
2207         j = pa[i];
2208         iptr += k;
2209         sbuf1[j] = iptr;
2210         k        = w1[j];
2211       }
2212     }
2213 
2214     /* Form the outgoing messages. Initialize the header space */
2215     for (i = 0; i < nrqs; i++) {
2216       j           = pa[i];
2217       sbuf1[j][0] = 0;
2218       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2219       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2220     }
2221 
2222     /* Parse the isrow and copy data into outbuf */
2223     for (i = 0; i < ismax; i++) {
2224       row2proc_i = row2proc[i];
2225       PetscCall(PetscArrayzero(ctr, size));
2226       irow_i = irow[i];
2227       jmax   = nrow[i];
2228       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2229         proc = row2proc_i[j];
2230         if (proc != rank) { /* copy to the outgoing buf*/
2231           ctr[proc]++;
2232           *ptr[proc] = irow_i[j];
2233           ptr[proc]++;
2234         }
2235       }
2236       /* Update the headers for the current IS */
2237       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2238         if ((ctr_j = ctr[j])) {
2239           sbuf1_j            = sbuf1[j];
2240           k                  = ++sbuf1_j[0];
2241           sbuf1_j[2 * k]     = ctr_j;
2242           sbuf1_j[2 * k - 1] = i;
2243         }
2244       }
2245     }
2246 
2247     /*  Now  post the sends */
2248     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2249     for (i = 0; i < nrqs; ++i) {
2250       j = pa[i];
2251       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2252     }
2253 
2254     /* Post Receives to capture the buffer size */
2255     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2256     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2257     if (nrqs) rbuf2[0] = tmp + msz;
2258     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2259     for (i = 0; i < nrqs; ++i) {
2260       j = pa[i];
2261       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2262     }
2263 
2264     /* Send to other procs the buf size they should allocate */
2265     /* Receive messages*/
2266     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2267     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2268     {
2269       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2270       PetscInt *sbuf2_i;
2271 
2272       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2273       for (i = 0; i < nrqr; ++i) {
2274         req_size[i] = 0;
2275         rbuf1_i     = rbuf1[i];
2276         start       = 2 * rbuf1_i[0] + 1;
2277         end         = olengths1[i];
2278         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2279         sbuf2_i = sbuf2[i];
2280         for (j = start; j < end; j++) {
2281           id         = rbuf1_i[j] - rstart;
2282           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2283           sbuf2_i[j] = ncols;
2284           req_size[i] += ncols;
2285         }
2286         req_source1[i] = onodes1[i];
2287         /* form the header */
2288         sbuf2_i[0] = req_size[i];
2289         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2290 
2291         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2292       }
2293     }
2294 
2295     PetscCall(PetscFree(onodes1));
2296     PetscCall(PetscFree(olengths1));
2297     PetscCall(PetscFree(r_waits1));
2298     PetscCall(PetscFree4(w1, w2, w3, w4));
2299 
2300     /* Receive messages*/
2301     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2302     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2303     for (i = 0; i < nrqs; ++i) {
2304       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2305       req_source2[i] = pa[i];
2306       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2307     }
2308     PetscCall(PetscFree(r_waits2));
2309 
2310     /* Wait on sends1 and sends2 */
2311     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2312     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2313     PetscCall(PetscFree(s_waits1));
2314     PetscCall(PetscFree(s_waits2));
2315 
2316     /* Now allocate sending buffers for a->j, and send them off */
2317     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2318     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2319     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2320     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2321 
2322     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2323     {
2324       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2325       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2326       PetscInt  cend = C->cmap->rend;
2327       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2328 
2329       for (i = 0; i < nrqr; i++) {
2330         rbuf1_i   = rbuf1[i];
2331         sbuf_aj_i = sbuf_aj[i];
2332         ct1       = 2 * rbuf1_i[0] + 1;
2333         ct2       = 0;
2334         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2335           kmax = rbuf1[i][2 * j];
2336           for (k = 0; k < kmax; k++, ct1++) {
2337             row    = rbuf1_i[ct1] - rstart;
2338             nzA    = a_i[row + 1] - a_i[row];
2339             nzB    = b_i[row + 1] - b_i[row];
2340             ncols  = nzA + nzB;
2341             cworkA = a_j + a_i[row];
2342             cworkB = b_j + b_i[row];
2343 
2344             /* load the column indices for this row into cols */
2345             cols = sbuf_aj_i + ct2;
2346 
2347             lwrite = 0;
2348             for (l = 0; l < nzB; l++) {
2349               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2350             }
2351             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2352             for (l = 0; l < nzB; l++) {
2353               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2354             }
2355 
2356             ct2 += ncols;
2357           }
2358         }
2359         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2360       }
2361     }
2362 
2363     /* create col map: global col of C -> local col of submatrices */
2364     {
2365       const PetscInt *icol_i;
2366 #if defined(PETSC_USE_CTABLE)
2367       for (i = 0; i < ismax; i++) {
2368         if (!allcolumns[i]) {
2369           PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]));
2370 
2371           jmax   = ncol[i];
2372           icol_i = icol[i];
2373           cmap_i = cmap[i];
2374           for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES));
2375         } else cmap[i] = NULL;
2376       }
2377 #else
2378       for (i = 0; i < ismax; i++) {
2379         if (!allcolumns[i]) {
2380           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2381           jmax   = ncol[i];
2382           icol_i = icol[i];
2383           cmap_i = cmap[i];
2384           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2385         } else cmap[i] = NULL;
2386       }
2387 #endif
2388     }
2389 
2390     /* Create lens which is required for MatCreate... */
2391     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2392     PetscCall(PetscMalloc1(ismax, &lens));
2393 
2394     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2395     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2396 
2397     /* Update lens from local data */
2398     for (i = 0; i < ismax; i++) {
2399       row2proc_i = row2proc[i];
2400       jmax       = nrow[i];
2401       if (!allcolumns[i]) cmap_i = cmap[i];
2402       irow_i = irow[i];
2403       lens_i = lens[i];
2404       for (j = 0; j < jmax; j++) {
2405         row  = irow_i[j];
2406         proc = row2proc_i[j];
2407         if (proc == rank) {
2408           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2409           if (!allcolumns[i]) {
2410             for (k = 0; k < ncols; k++) {
2411 #if defined(PETSC_USE_CTABLE)
2412               PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2413 #else
2414               tcol = cmap_i[cols[k]];
2415 #endif
2416               if (tcol) lens_i[j]++;
2417             }
2418           } else { /* allcolumns */
2419             lens_i[j] = ncols;
2420           }
2421           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2422         }
2423       }
2424     }
2425 
2426     /* Create row map: global row of C -> local row of submatrices */
2427 #if defined(PETSC_USE_CTABLE)
2428     for (i = 0; i < ismax; i++) {
2429       PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]));
2430       irow_i = irow[i];
2431       jmax   = nrow[i];
2432       for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES));
2433     }
2434 #else
2435     for (i = 0; i < ismax; i++) {
2436       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2437       rmap_i = rmap[i];
2438       irow_i = irow[i];
2439       jmax   = nrow[i];
2440       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2441     }
2442 #endif
2443 
2444     /* Update lens from offproc data */
2445     {
2446       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2447 
2448       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2449       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2450         sbuf1_i = sbuf1[pa[tmp2]];
2451         jmax    = sbuf1_i[0];
2452         ct1     = 2 * jmax + 1;
2453         ct2     = 0;
2454         rbuf2_i = rbuf2[tmp2];
2455         rbuf3_i = rbuf3[tmp2];
2456         for (j = 1; j <= jmax; j++) {
2457           is_no  = sbuf1_i[2 * j - 1];
2458           max1   = sbuf1_i[2 * j];
2459           lens_i = lens[is_no];
2460           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2461           rmap_i = rmap[is_no];
2462           for (k = 0; k < max1; k++, ct1++) {
2463 #if defined(PETSC_USE_CTABLE)
2464             PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row));
2465             row--;
2466             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2467 #else
2468             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2469 #endif
2470             max2 = rbuf2_i[ct1];
2471             for (l = 0; l < max2; l++, ct2++) {
2472               if (!allcolumns[is_no]) {
2473 #if defined(PETSC_USE_CTABLE)
2474                 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2475 #else
2476                 tcol = cmap_i[rbuf3_i[ct2]];
2477 #endif
2478                 if (tcol) lens_i[row]++;
2479               } else {         /* allcolumns */
2480                 lens_i[row]++; /* lens_i[row] += max2 ? */
2481               }
2482             }
2483           }
2484         }
2485       }
2486     }
2487     PetscCall(PetscFree(r_waits3));
2488     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2489     PetscCall(PetscFree(s_waits3));
2490 
2491     /* Create the submatrices */
2492     for (i = 0; i < ismax; i++) {
2493       PetscInt rbs, cbs;
2494 
2495       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2496       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2497 
2498       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2499       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2500 
2501       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2502       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2503       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2504 
2505       /* create struct Mat_SubSppt and attached it to submat */
2506       PetscCall(PetscNew(&smat_i));
2507       subc            = (Mat_SeqAIJ *)submats[i]->data;
2508       subc->submatis1 = smat_i;
2509 
2510       smat_i->destroy          = submats[i]->ops->destroy;
2511       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2512       submats[i]->factortype   = C->factortype;
2513 
2514       smat_i->id          = i;
2515       smat_i->nrqs        = nrqs;
2516       smat_i->nrqr        = nrqr;
2517       smat_i->rbuf1       = rbuf1;
2518       smat_i->rbuf2       = rbuf2;
2519       smat_i->rbuf3       = rbuf3;
2520       smat_i->sbuf2       = sbuf2;
2521       smat_i->req_source2 = req_source2;
2522 
2523       smat_i->sbuf1 = sbuf1;
2524       smat_i->ptr   = ptr;
2525       smat_i->tmp   = tmp;
2526       smat_i->ctr   = ctr;
2527 
2528       smat_i->pa          = pa;
2529       smat_i->req_size    = req_size;
2530       smat_i->req_source1 = req_source1;
2531 
2532       smat_i->allcolumns = allcolumns[i];
2533       smat_i->singleis   = PETSC_FALSE;
2534       smat_i->row2proc   = row2proc[i];
2535       smat_i->rmap       = rmap[i];
2536       smat_i->cmap       = cmap[i];
2537     }
2538 
2539     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2540       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2541       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2542       PetscCall(MatSetType(submats[0], MATDUMMY));
2543 
2544       /* create struct Mat_SubSppt and attached it to submat */
2545       PetscCall(PetscNew(&smat_i));
2546       submats[0]->data = (void *)smat_i;
2547 
2548       smat_i->destroy          = submats[0]->ops->destroy;
2549       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2550       submats[0]->factortype   = C->factortype;
2551 
2552       smat_i->id          = 0;
2553       smat_i->nrqs        = nrqs;
2554       smat_i->nrqr        = nrqr;
2555       smat_i->rbuf1       = rbuf1;
2556       smat_i->rbuf2       = rbuf2;
2557       smat_i->rbuf3       = rbuf3;
2558       smat_i->sbuf2       = sbuf2;
2559       smat_i->req_source2 = req_source2;
2560 
2561       smat_i->sbuf1 = sbuf1;
2562       smat_i->ptr   = ptr;
2563       smat_i->tmp   = tmp;
2564       smat_i->ctr   = ctr;
2565 
2566       smat_i->pa          = pa;
2567       smat_i->req_size    = req_size;
2568       smat_i->req_source1 = req_source1;
2569 
2570       smat_i->allcolumns = PETSC_FALSE;
2571       smat_i->singleis   = PETSC_FALSE;
2572       smat_i->row2proc   = NULL;
2573       smat_i->rmap       = NULL;
2574       smat_i->cmap       = NULL;
2575     }
2576 
2577     if (ismax) PetscCall(PetscFree(lens[0]));
2578     PetscCall(PetscFree(lens));
2579     if (sbuf_aj) {
2580       PetscCall(PetscFree(sbuf_aj[0]));
2581       PetscCall(PetscFree(sbuf_aj));
2582     }
2583 
2584   } /* endof scall == MAT_INITIAL_MATRIX */
2585 
2586   /* Post recv matrix values */
2587   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2588   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2589   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2590   for (i = 0; i < nrqs; ++i) {
2591     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2592     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2593   }
2594 
2595   /* Allocate sending buffers for a->a, and send them off */
2596   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2597   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2598   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2599   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2600 
2601   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2602   {
2603     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2604     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2605     PetscInt     cend = C->cmap->rend;
2606     PetscInt    *b_j  = b->j;
2607     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2608 
2609     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2610     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2611     for (i = 0; i < nrqr; i++) {
2612       rbuf1_i   = rbuf1[i];
2613       sbuf_aa_i = sbuf_aa[i];
2614       ct1       = 2 * rbuf1_i[0] + 1;
2615       ct2       = 0;
2616       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2617         kmax = rbuf1_i[2 * j];
2618         for (k = 0; k < kmax; k++, ct1++) {
2619           row    = rbuf1_i[ct1] - rstart;
2620           nzA    = a_i[row + 1] - a_i[row];
2621           nzB    = b_i[row + 1] - b_i[row];
2622           ncols  = nzA + nzB;
2623           cworkB = b_j + b_i[row];
2624           vworkA = a_a + a_i[row];
2625           vworkB = b_a + b_i[row];
2626 
2627           /* load the column values for this row into vals*/
2628           vals = sbuf_aa_i + ct2;
2629 
2630           lwrite = 0;
2631           for (l = 0; l < nzB; l++) {
2632             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2633           }
2634           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2635           for (l = 0; l < nzB; l++) {
2636             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2637           }
2638 
2639           ct2 += ncols;
2640         }
2641       }
2642       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2643     }
2644     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2645     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2646   }
2647 
2648   /* Assemble the matrices */
2649   /* First assemble the local rows */
2650   for (i = 0; i < ismax; i++) {
2651     row2proc_i = row2proc[i];
2652     subc       = (Mat_SeqAIJ *)submats[i]->data;
2653     imat_ilen  = subc->ilen;
2654     imat_j     = subc->j;
2655     imat_i     = subc->i;
2656     imat_a     = subc->a;
2657 
2658     if (!allcolumns[i]) cmap_i = cmap[i];
2659     rmap_i = rmap[i];
2660     irow_i = irow[i];
2661     jmax   = nrow[i];
2662     for (j = 0; j < jmax; j++) {
2663       row  = irow_i[j];
2664       proc = row2proc_i[j];
2665       if (proc == rank) {
2666         old_row = row;
2667 #if defined(PETSC_USE_CTABLE)
2668         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2669         row--;
2670 #else
2671         row = rmap_i[row];
2672 #endif
2673         ilen_row = imat_ilen[row];
2674         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2675         mat_i = imat_i[row];
2676         mat_a = imat_a + mat_i;
2677         mat_j = imat_j + mat_i;
2678         if (!allcolumns[i]) {
2679           for (k = 0; k < ncols; k++) {
2680 #if defined(PETSC_USE_CTABLE)
2681             PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2682 #else
2683             tcol = cmap_i[cols[k]];
2684 #endif
2685             if (tcol) {
2686               *mat_j++ = tcol - 1;
2687               *mat_a++ = vals[k];
2688               ilen_row++;
2689             }
2690           }
2691         } else { /* allcolumns */
2692           for (k = 0; k < ncols; k++) {
2693             *mat_j++ = cols[k]; /* global col index! */
2694             *mat_a++ = vals[k];
2695             ilen_row++;
2696           }
2697         }
2698         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2699 
2700         imat_ilen[row] = ilen_row;
2701       }
2702     }
2703   }
2704 
2705   /* Now assemble the off proc rows */
2706   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2707   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2708     sbuf1_i = sbuf1[pa[tmp2]];
2709     jmax    = sbuf1_i[0];
2710     ct1     = 2 * jmax + 1;
2711     ct2     = 0;
2712     rbuf2_i = rbuf2[tmp2];
2713     rbuf3_i = rbuf3[tmp2];
2714     rbuf4_i = rbuf4[tmp2];
2715     for (j = 1; j <= jmax; j++) {
2716       is_no  = sbuf1_i[2 * j - 1];
2717       rmap_i = rmap[is_no];
2718       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2719       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2720       imat_ilen = subc->ilen;
2721       imat_j    = subc->j;
2722       imat_i    = subc->i;
2723       imat_a    = subc->a;
2724       max1      = sbuf1_i[2 * j];
2725       for (k = 0; k < max1; k++, ct1++) {
2726         row = sbuf1_i[ct1];
2727 #if defined(PETSC_USE_CTABLE)
2728         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2729         row--;
2730 #else
2731         row = rmap_i[row];
2732 #endif
2733         ilen  = imat_ilen[row];
2734         mat_i = imat_i[row];
2735         mat_a = imat_a + mat_i;
2736         mat_j = imat_j + mat_i;
2737         max2  = rbuf2_i[ct1];
2738         if (!allcolumns[is_no]) {
2739           for (l = 0; l < max2; l++, ct2++) {
2740 #if defined(PETSC_USE_CTABLE)
2741             PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2742 #else
2743             tcol = cmap_i[rbuf3_i[ct2]];
2744 #endif
2745             if (tcol) {
2746               *mat_j++ = tcol - 1;
2747               *mat_a++ = rbuf4_i[ct2];
2748               ilen++;
2749             }
2750           }
2751         } else { /* allcolumns */
2752           for (l = 0; l < max2; l++, ct2++) {
2753             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2754             *mat_a++ = rbuf4_i[ct2];
2755             ilen++;
2756           }
2757         }
2758         imat_ilen[row] = ilen;
2759       }
2760     }
2761   }
2762 
2763   if (!iscsorted) { /* sort column indices of the rows */
2764     for (i = 0; i < ismax; i++) {
2765       subc      = (Mat_SeqAIJ *)submats[i]->data;
2766       imat_j    = subc->j;
2767       imat_i    = subc->i;
2768       imat_a    = subc->a;
2769       imat_ilen = subc->ilen;
2770 
2771       if (allcolumns[i]) continue;
2772       jmax = nrow[i];
2773       for (j = 0; j < jmax; j++) {
2774         mat_i = imat_i[j];
2775         mat_a = imat_a + mat_i;
2776         mat_j = imat_j + mat_i;
2777         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2778       }
2779     }
2780   }
2781 
2782   PetscCall(PetscFree(r_waits4));
2783   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2784   PetscCall(PetscFree(s_waits4));
2785 
2786   /* Restore the indices */
2787   for (i = 0; i < ismax; i++) {
2788     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2789     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2790   }
2791 
2792   for (i = 0; i < ismax; i++) {
2793     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2794     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2795   }
2796 
2797   /* Destroy allocated memory */
2798   if (sbuf_aa) {
2799     PetscCall(PetscFree(sbuf_aa[0]));
2800     PetscCall(PetscFree(sbuf_aa));
2801   }
2802   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2803 
2804   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2805   PetscCall(PetscFree(rbuf4));
2806 
2807   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2808   PetscFunctionReturn(0);
2809 }
2810 
2811 /*
2812  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2813  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2815  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2816  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2817  state, and needs to be "assembled" later by compressing B's column space.
2818 
2819  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2820  Following this call, C->A & C->B have been created, even if empty.
2821  */
2822 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2823 {
2824   /* If making this function public, change the error returned in this function away from _PLIB. */
2825   Mat_MPIAIJ     *aij;
2826   Mat_SeqAIJ     *Baij;
2827   PetscBool       seqaij, Bdisassembled;
2828   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2829   PetscScalar     v;
2830   const PetscInt *rowindices, *colindices;
2831 
2832   PetscFunctionBegin;
2833   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2834   if (A) {
2835     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2836     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2837     if (rowemb) {
2838       PetscCall(ISGetLocalSize(rowemb, &m));
2839       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2840     } else {
2841       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2842     }
2843     if (dcolemb) {
2844       PetscCall(ISGetLocalSize(dcolemb, &n));
2845       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2846     } else {
2847       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2848     }
2849   }
2850   if (B) {
2851     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2852     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2853     if (rowemb) {
2854       PetscCall(ISGetLocalSize(rowemb, &m));
2855       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2856     } else {
2857       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2858     }
2859     if (ocolemb) {
2860       PetscCall(ISGetLocalSize(ocolemb, &n));
2861       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2862     } else {
2863       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2864     }
2865   }
2866 
2867   aij = (Mat_MPIAIJ *)C->data;
2868   if (!aij->A) {
2869     /* Mimic parts of MatMPIAIJSetPreallocation() */
2870     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2871     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2872     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2873     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2874   }
2875   if (A) {
2876     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2877   } else {
2878     PetscCall(MatSetUp(aij->A));
2879   }
2880   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2881     /*
2882       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2883       need to "disassemble" B -- convert it to using C's global indices.
2884       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2885 
2886       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2887       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2888 
2889       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2890       At least avoid calling MatSetValues() and the implied searches?
2891     */
2892 
2893     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2894 #if defined(PETSC_USE_CTABLE)
2895       PetscCall(PetscTableDestroy(&aij->colmap));
2896 #else
2897       PetscCall(PetscFree(aij->colmap));
2898       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2899 #endif
2900       ngcol = 0;
2901       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2902       if (aij->garray) { PetscCall(PetscFree(aij->garray)); }
2903       PetscCall(VecDestroy(&aij->lvec));
2904       PetscCall(VecScatterDestroy(&aij->Mvctx));
2905     }
2906     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2907     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2908   }
2909   Bdisassembled = PETSC_FALSE;
2910   if (!aij->B) {
2911     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2912     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2913     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2914     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2915     Bdisassembled = PETSC_TRUE;
2916   }
2917   if (B) {
2918     Baij = (Mat_SeqAIJ *)B->data;
2919     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2920       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2921       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2922       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2923       PetscCall(PetscFree(nz));
2924     }
2925 
2926     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2927     shift      = rend - rstart;
2928     count      = 0;
2929     rowindices = NULL;
2930     colindices = NULL;
2931     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2932     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2933     for (i = 0; i < B->rmap->n; i++) {
2934       PetscInt row;
2935       row = i;
2936       if (rowindices) row = rowindices[i];
2937       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2938         col = Baij->j[count];
2939         if (colindices) col = colindices[col];
2940         if (Bdisassembled && col >= rstart) col += shift;
2941         v = Baij->a[count];
2942         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2943         ++count;
2944       }
2945     }
2946     /* No assembly for aij->B is necessary. */
2947     /* FIXME: set aij->B's nonzerostate correctly. */
2948   } else {
2949     PetscCall(MatSetUp(aij->B));
2950   }
2951   C->preallocated  = PETSC_TRUE;
2952   C->was_assembled = PETSC_FALSE;
2953   C->assembled     = PETSC_FALSE;
2954   /*
2955       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2956       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2957    */
2958   PetscFunctionReturn(0);
2959 }
2960 
2961 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2963  */
2964 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2965 {
2966   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2967 
2968   PetscFunctionBegin;
2969   PetscValidPointer(A, 2);
2970   PetscValidPointer(B, 3);
2971   /* FIXME: make sure C is assembled */
2972   *A = aij->A;
2973   *B = aij->B;
2974   /* Note that we don't incref *A and *B, so be careful! */
2975   PetscFunctionReturn(0);
2976 }
2977 
2978 /*
2979   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2980   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2981 */
2982 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
2983 {
2984   PetscMPIInt size, flag;
2985   PetscInt    i, ii, cismax, ispar;
2986   Mat        *A, *B;
2987   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
2988 
2989   PetscFunctionBegin;
2990   if (!ismax) PetscFunctionReturn(0);
2991 
2992   for (i = 0, cismax = 0; i < ismax; ++i) {
2993     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
2994     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2995     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
2996     if (size > 1) ++cismax;
2997   }
2998 
2999   /*
3000      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3001      ispar counts the number of parallel ISs across C's comm.
3002   */
3003   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3004   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3005     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3006     PetscFunctionReturn(0);
3007   }
3008 
3009   /* if (ispar) */
3010   /*
3011     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3012     These are used to extract the off-diag portion of the resulting parallel matrix.
3013     The row IS for the off-diag portion is the same as for the diag portion,
3014     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3015   */
3016   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3017   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3018   for (i = 0, ii = 0; i < ismax; ++i) {
3019     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3020     if (size > 1) {
3021       /*
3022          TODO: This is the part that's ***NOT SCALABLE***.
3023          To fix this we need to extract just the indices of C's nonzero columns
3024          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3025          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3026          to be done without serializing on the IS list, so, most likely, it is best
3027          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3028       */
3029       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3030       /* Now we have to
3031          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3032              were sorted on each rank, concatenated they might no longer be sorted;
3033          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3034              indices in the nondecreasing order to the original index positions.
3035          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3036       */
3037       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3038       PetscCall(ISSort(ciscol[ii]));
3039       ++ii;
3040     }
3041   }
3042   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3043   for (i = 0, ii = 0; i < ismax; ++i) {
3044     PetscInt        j, issize;
3045     const PetscInt *indices;
3046 
3047     /*
3048        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3049      */
3050     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3051     PetscCall(ISSort(isrow[i]));
3052     PetscCall(ISGetLocalSize(isrow[i], &issize));
3053     PetscCall(ISGetIndices(isrow[i], &indices));
3054     for (j = 1; j < issize; ++j) {
3055       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3056     }
3057     PetscCall(ISRestoreIndices(isrow[i], &indices));
3058     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3059     PetscCall(ISSort(iscol[i]));
3060     PetscCall(ISGetLocalSize(iscol[i], &issize));
3061     PetscCall(ISGetIndices(iscol[i], &indices));
3062     for (j = 1; j < issize; ++j) {
3063       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3064     }
3065     PetscCall(ISRestoreIndices(iscol[i], &indices));
3066     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3067     if (size > 1) {
3068       cisrow[ii] = isrow[i];
3069       ++ii;
3070     }
3071   }
3072   /*
3073     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3074     array of sequential matrices underlying the resulting parallel matrices.
3075     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3076     contain duplicates.
3077 
3078     There are as many diag matrices as there are original index sets. There are only as many parallel
3079     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3080 
3081     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3082     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3083       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3084       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3085       with A[i] and B[ii] extracted from the corresponding MPI submat.
3086     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3087       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3088       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3089       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3090       values into A[i] and B[ii] sitting inside the corresponding submat.
3091     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3092       A[i], B[ii], AA[i] or BB[ii] matrices.
3093   */
3094   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3095   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3096 
3097   /* Now obtain the sequential A and B submatrices separately. */
3098   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3099   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3100   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3101 
3102   /*
3103     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3104     matrices A & B have been extracted directly into the parallel matrices containing them, or
3105     simply into the sequential matrix identical with the corresponding A (if size == 1).
3106     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3107     to have the same sparsity pattern.
3108     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3109     must be constructed for C. This is done by setseqmat(s).
3110   */
3111   for (i = 0, ii = 0; i < ismax; ++i) {
3112     /*
3113        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3114        That way we can avoid sorting and computing permutations when reusing.
3115        To this end:
3116         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3117         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3118           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3119     */
3120     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3121 
3122     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3123     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3124     if (size > 1) {
3125       if (scall == MAT_INITIAL_MATRIX) {
3126         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3127         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3128         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3129         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3130         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3131       }
3132       /*
3133         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3134       */
3135       {
3136         Mat AA = A[i], BB = B[ii];
3137 
3138         if (AA || BB) {
3139           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3140           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3141           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3142         }
3143         PetscCall(MatDestroy(&AA));
3144       }
3145       PetscCall(ISDestroy(ciscol + ii));
3146       PetscCall(ISDestroy(ciscol_p + ii));
3147       ++ii;
3148     } else { /* if (size == 1) */
3149       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3150       if (isrow_p[i] || iscol_p[i]) {
3151         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3152         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3153         /* Otherwise A is extracted straight into (*submats)[i]. */
3154         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3155         PetscCall(MatDestroy(A + i));
3156       } else (*submat)[i] = A[i];
3157     }
3158     PetscCall(ISDestroy(&isrow_p[i]));
3159     PetscCall(ISDestroy(&iscol_p[i]));
3160   }
3161   PetscCall(PetscFree2(cisrow, ciscol));
3162   PetscCall(PetscFree2(isrow_p, iscol_p));
3163   PetscCall(PetscFree(ciscol_p));
3164   PetscCall(PetscFree(A));
3165   PetscCall(MatDestroySubMatrices(cismax, &B));
3166   PetscFunctionReturn(0);
3167 }
3168 
/*
   MatCreateSubMatricesMPI_MPIAIJ - MPIAIJ implementation of MatCreateSubMatricesMPI():
   extracts (possibly parallel) submatrices of C indexed by isrow[]/iscol[], where each
   IS may live on a subcommunicator of C's communicator.

   This is a thin dispatch wrapper: all of the work is done by the type-independent
   driver MatCreateSubMatricesMPI_MPIXAIJ(), which is parameterized by four
   AIJ-specific callbacks:
     - MatCreateSubMatrices_MPIAIJ: used by the driver as its getsubmats_seq routine
       to pull out the sequential diag/off-diag pieces;
     - MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ: presumably
       retrieve and (re)embed the sequential A/B pieces of the MPI result
       (the driver invokes them as setseqmat/setseqmats) — see the driver body above.

   Collective on the communicator of C. Input/output semantics (scall ==
   MAT_INITIAL_MATRIX vs MAT_REUSE_MATRIX, allocation of *submat) follow the
   generic MatCreateSubMatricesMPI() contract.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}
3175