xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) {
22   PetscInt i;
23 
24   PetscFunctionBegin;
25   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
27   PetscFunctionReturn(0);
28 }
29 
30 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) {
31   PetscInt i;
32 
33   PetscFunctionBegin;
34   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
35   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
36   PetscFunctionReturn(0);
37 }
38 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - one level of overlap, memory-scalable
  variant.

  Outline:
    1. Gather all indices of all ISes and classify each as local (owned by this
       rank) or remote (owned elsewhere) using the matrix row layout.
    2. If no rank has any remote rows (typical for the first overlap level),
       only local expansion is needed; delegate to the _Local_Scalable kernel.
    3. Otherwise build a two-sided map (PetscCommBuildTwoSided) of who talks to
       whom, push <is id, row> pairs to the owning ranks through a PetscSF,
       let owners compute the adjacency of the requested rows
       (_Send_Scalable), pull the replies back through a second PetscSF, and
       merge them into the ISes (_Receive_Scalable).

  All ISes in is[] are destroyed and recreated with the enlarged index lists.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[]) {
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      /* each remote row occupies two entries in the message: <is id, row> */
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  /* SF leaves index into the sender's todata buffer at offset fromsizes[2*i+1] */
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: former receivers now send reply sizes back */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: do the local expansion while the
     reply data is in flight */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
180 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from
  remote ranks back into the local index sets.

  recvdata is a stream of nrecvs PetscInts made of triples
  <is id, data size, rows...>.  Each is[i] is destroyed and recreated holding
  the sorted, duplicate-free union of its previous entries and the received
  rows for that IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) {
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of (max_lsize + nrecvs) entries: enough for its current
     indices plus, in the worst case, every received row */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* keep the communicator alive because is[i] is destroyed before it is recreated */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries and rebuild the index sets */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}
227 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - on the owner side, expand the rows
  requested by remote ranks and pack the replies.

  fromrows contains, per requesting rank i, fromsizes[2*i]/2 pairs
  <is id, global row>.  For every requested row owned locally, the column
  indices of that row (diagonal block Amat shifted by cstart, off-diagonal
  block Bmat mapped through gcols) are collected, sorted and deduplicated
  per IS.  The reply is the triple stream <is id, count, rows...> returned in
  *sbrows, with per-destination sizes/offsets in *sbrowsizes (caller frees
  both).  Two passes are made over the same data: the first only counts so a
  single exactly-sized reply buffer can be allocated, the second fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) {
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count, per <requesting rank, is>, how many distinct columns the
     requested rows touch */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart; /* global row -> local row of the diagonal block */
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal as above, this time filling the reply buffer */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the per-destination sizes into send offsets */
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
348 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - purely local part of the overlap
  increase: for every locally-owned row already in is[i], add the column
  indices of that row (diagonal block via aj+cstart, off-diagonal block via
  gcols) to is[i].  Rows owned by other ranks are skipped here; they are
  handled by the send/receive path.  Each IS is destroyed and recreated with
  the sorted, deduplicated result.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[]) {
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remote rows are expanded on their owner */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* hold the communicator across the destroy/recreate of is[i] */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}
414 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
/*
  MatIncreaseOverlap_MPIAIJ_Once - one level of overlap, original (non-scalable)
  variant: every rank keeps a bitmap (PetscBT) over all M global rows per IS.

  Protocol:
    round 1 - each rank sends the indices it does not own to their owners
              (format above, tag1) and meanwhile expands its own rows locally;
    round 2 - owners expand the received rows (MatIncreaseOverlap_MPIAIJ_Receive)
              and send the results back (tag2), which are merged into the
              tables;
  finally each is[i] is destroyed and recreated from its table.

  With PETSC_USE_CTABLE the per-IS row lists are kept in hash tables instead of
  dense M-sized arrays to save memory.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[]) {
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscTable *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p]: total indices destined for rank p; w3[p]: number of ISes with at
     least one such index; w4: per-IS scratch counter */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j];
        w3[j]++;
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single buffer tmp into one sub-buffer per destination rank */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscTableCreate(n[i], M, &table_data[i]));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

    /* Do Local work while round-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  /* expand the rows requested by other ranks; replies go in xdata/isz1 */
  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, and from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message; same header layout as the outgoing format above */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1;
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  /* rebuild each IS from its accumulated table */
  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
      /* table stores key+1 -> position+1; undo the +1 shifts */
      tdata[--j] = --k;
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscTableDestroy((PetscTable *)&table_data[i]));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}
782 
783 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
785        the work on the local processor.
786 
787      Inputs:
788       C      - MAT_MPIAIJ;
789       imax - total no of index sets processed at a time;
790       table  - an array of char - size = m bits.
791 
792      Output:
793       isz    - array containing the count of the solution elements corresponding
794                to each index set;
795       data or table_data  - pointer to the solutions
796 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data) {
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* diagonal and off-diagonal blocks of the MPIAIJ matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i; /* bitmap over global indices marking members of set i */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i; /* hash table for set i: global index+1 -> insertion position+1 */
  PetscTablePosition tpos;
  PetscInt           tcount, *tdata;
#else
  PetscInt *data_i; /* dense array holding the global indices of set i */
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* global column offset of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* compressed off-diagonal column index -> global column index */

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] so the set can be
       scanned by insertion position below: tdata[pos] = global index */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      /* keys and values were stored shifted by +1 (PetscTable cannot store 0) */
      tdata[--j] = --row;
      /* NOTE(review): this bound check runs after tdata[j] has been written; an
         out-of-range j would already have written out of bounds — confirm intent */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i]; /* running size of set i; grows as new columns are appended */
    max     = isz[i]; /* only the entries present on entry are expanded; new additions are not rescanned in this call */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart; /* diagonal-block local column -> global column */
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]]; /* compressed off-diagonal column -> global column */
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* publish the enlarged set size */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
883 
884 /*
885       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
886          and return the output
887 
888          Input:
889            C    - the matrix
890            nrqr - no of messages being processed.
891            rbuf - an array of pointers to the received requests
892 
893          Output:
894            xdata - array of messages to be sent back
895            isz1  - size of each message
896 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
901 
902 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1) {
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B; /* diagonal and off-diagonal blocks of C */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N; /* global row count: size of the duplicate-filter bitmap */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* First pass: ct = total number of ISs requested across all messages,
     total_sz = total number of requested row indices (used to size the reply buffer) */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0]; /* header word 0: number of ISs in this message */
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j]; /* header word 2*j: length of IS j */
  }

  /* Estimate reply size from the average number of nonzeros per local row */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    /* all replies live in one contiguous buffer; each xdata[i] points into xdata[0] */
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0; /* total words written over all replies; compared against mem_estimate before each write */
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2 * rbuf_0 + 1; /* read cursor: first row index past the header */
    ct2    = ct1;            /* write cursor into xdata[i]; the reply reuses the same header layout */
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable)); /* duplicates are filtered per IS, not per message */
      oct2 = ct2;                           /* remember where this IS's output starts */
      kmax = rbuf_i[2 * j];
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* out of space: grow the shared buffer by 1.5x and re-point every xdata[ctr] into it */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Expand: for each requested row just stored, append the global column
         indices of its nonzeros from both the diagonal and off-diagonal blocks */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* Amat: local column + cstart gives the global column */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              /* same 1.5x growth + re-pointing as above */
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* Bmat: garray maps compressed columns to global ones */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              /* same 1.5x growth + re-pointing as above */
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i] = ct2;                                   /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  /* NOTE(review): mem_estimate/ct3 count PetscInt words, not bytes, despite the message text */
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(0);
}
1021 /* -------------------------------------------------------------------------*/
1022 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1023 /*
1024     Every processor gets the entire matrix
1025 */
1026 PetscErrorCode        MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[]) {
1027          Mat         B;
1028          Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1029          Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1030          PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1031          PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1032          PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1033 
1034          PetscFunctionBegin;
1035          PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1036          PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1037          if (scall == MAT_INITIAL_MATRIX) {
1038            /* ----------------------------------------------------------------
1039          Tell every processor the number of nonzeros per row
1040     */
1041     PetscCall(PetscMalloc1(A->rmap->N, &lens));
1042     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1043     PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1044     for (i = 0; i < size; i++) {
1045              recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1046              displs[i]     = A->rmap->range[i];
1047     }
1048     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1049     /* ---------------------------------------------------------------
1050          Create the sequential matrix of the same type as the local block diagonal
1051     */
1052     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1053     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1054     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1055     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1056     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1057     PetscCall(PetscCalloc1(2, Bin));
1058     **Bin = B;
1059     b     = (Mat_SeqAIJ *)B->data;
1060 
1061     /*--------------------------------------------------------------------
1062        Copy my part of matrix column indices over
1063     */
1064     sendcount  = ad->nz + bd->nz;
1065     jsendbuf   = b->j + b->i[rstarts[rank]];
1066     a_jsendbuf = ad->j;
1067     b_jsendbuf = bd->j;
1068     n          = A->rmap->rend - A->rmap->rstart;
1069     cnt        = 0;
1070     for (i = 0; i < n; i++) {
1071              /* put in lower diagonal portion */
1072       m = bd->i[i + 1] - bd->i[i];
1073       while (m > 0) {
1074                /* is it above diagonal (in bd (compressed) numbering) */
1075         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1076         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1077         m--;
1078       }
1079 
1080       /* put in diagonal portion */
1081       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1082 
1083       /* put in upper diagonal portion */
1084       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1085     }
1086     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1087 
1088     /*--------------------------------------------------------------------
1089        Gather all column indices to all processors
1090     */
1091     for (i = 0; i < size; i++) {
1092              recvcounts[i] = 0;
1093              for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1094     }
1095            displs[0] = 0;
1096     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1097     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1098     /*--------------------------------------------------------------------
1099         Assemble the matrix into useable form (note numerical values not yet set)
1100     */
1101     /* set the b->ilen (length of each row) values */
1102     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1103     /* set the b->i indices */
1104     b->i[0] = 0;
1105     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1106     PetscCall(PetscFree(lens));
1107     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1108     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1109 
1110   } else {
1111            B = **Bin;
1112            b = (Mat_SeqAIJ *)B->data;
1113   }
1114 
1115          /*--------------------------------------------------------------------
1116        Copy my part of matrix numerical values into the values location
1117   */
1118          if (flag == MAT_GET_VALUES) {
1119            const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1120            MatScalar         *sendbuf, *recvbuf;
1121 
1122            PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1123            PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1124            sendcount = ad->nz + bd->nz;
1125            sendbuf   = b->a + b->i[rstarts[rank]];
1126            a_sendbuf = ada;
1127            b_sendbuf = bda;
1128            b_sendj   = bd->j;
1129            n         = A->rmap->rend - A->rmap->rstart;
1130            cnt       = 0;
1131            for (i = 0; i < n; i++) {
1132              /* put in lower diagonal portion */
1133       m = bd->i[i + 1] - bd->i[i];
1134       while (m > 0) {
1135                /* is it above diagonal (in bd (compressed) numbering) */
1136         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1137         sendbuf[cnt++] = *b_sendbuf++;
1138         m--;
1139         b_sendj++;
1140       }
1141 
1142       /* put in diagonal portion */
1143       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1144 
1145       /* put in upper diagonal portion */
1146       while (m-- > 0) {
1147                sendbuf[cnt++] = *b_sendbuf++;
1148                b_sendj++;
1149       }
1150     }
1151            PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1152 
1153            /* -----------------------------------------------------------------
1154        Gather all numerical values to all processors
1155     */
1156            if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1157     for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1158     displs[0] = 0;
1159            for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1160     recvbuf = b->a;
1161            PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
1162            PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1163            PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1164   } /* endof (flag == MAT_GET_VALUES) */
1165          PetscCall(PetscFree2(recvcounts, displs));
1166 
1167          PetscCall(MatPropagateSymmetryOptions(A, B));
1168          PetscFunctionReturn(0);
1169 }
1170 
1171 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats) {
1172   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1173   Mat             submat, A = c->A, B = c->B;
1174   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1175   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1176   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1177   const PetscInt *icol, *irow;
1178   PetscInt        nrow, ncol, start;
1179   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1180   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1181   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1182   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1183   PetscInt       *lens, rmax, ncols, *cols, Crow;
1184 #if defined(PETSC_USE_CTABLE)
1185   PetscTable cmap, rmap;
1186   PetscInt  *cmap_loc, *rmap_loc;
1187 #else
1188   PetscInt *cmap, *rmap;
1189 #endif
1190   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1191   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1192   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1193   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1194   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1195   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1196   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1197   MPI_Comm      comm;
1198   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1199   PetscMPIInt  *onodes1, *olengths1, idex, end;
1200   Mat_SubSppt  *smatis1;
1201   PetscBool     isrowsorted, iscolsorted;
1202 
1203   PetscFunctionBegin;
1204   PetscValidLogicalCollectiveInt(C, ismax, 2);
1205   PetscValidLogicalCollectiveEnum(C, scall, 5);
1206   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1207   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1208   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1209   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1210   size = c->size;
1211   rank = c->rank;
1212 
1213   PetscCall(ISSorted(iscol[0], &iscolsorted));
1214   PetscCall(ISSorted(isrow[0], &isrowsorted));
1215   PetscCall(ISGetIndices(isrow[0], &irow));
1216   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1217   if (allcolumns) {
1218     icol = NULL;
1219     ncol = C->cmap->N;
1220   } else {
1221     PetscCall(ISGetIndices(iscol[0], &icol));
1222     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1223   }
1224 
1225   if (scall == MAT_INITIAL_MATRIX) {
1226     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1227 
1228     /* Get some new tags to keep the communication clean */
1229     tag1 = ((PetscObject)C)->tag;
1230     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1231     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1232 
1233     /* evaluate communication - mesg to who, length of mesg, and buffer space
1234      required. Based on this, buffers are allocated, and data copied into them */
1235     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1236     PetscCall(PetscMalloc1(nrow, &row2proc));
1237 
1238     /* w1[proc] = num of rows owned by proc -- to be requested */
1239     proc = 0;
1240     nrqs = 0; /* num of outgoing messages */
1241     for (j = 0; j < nrow; j++) {
1242       row = irow[j];
1243       if (!isrowsorted) proc = 0;
1244       while (row >= C->rmap->range[proc + 1]) proc++;
1245       w1[proc]++;
1246       row2proc[j] = proc; /* map row index to proc */
1247 
1248       if (proc != rank && !w2[proc]) {
1249         w2[proc] = 1;
1250         nrqs++;
1251       }
1252     }
1253     w1[rank] = 0; /* rows owned by self will not be requested */
1254 
1255     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1256     for (proc = 0, j = 0; proc < size; proc++) {
1257       if (w1[proc]) pa[j++] = proc;
1258     }
1259 
1260     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1261     msz = 0; /* total mesg length (for all procs) */
1262     for (i = 0; i < nrqs; i++) {
1263       proc = pa[i];
1264       w1[proc] += 3;
1265       msz += w1[proc];
1266     }
1267     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1268 
1269     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1270     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1271     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1272 
1273     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1274        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1275     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1276 
1277     /* Now post the Irecvs corresponding to these messages */
1278     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1279 
1280     PetscCall(PetscFree(onodes1));
1281     PetscCall(PetscFree(olengths1));
1282 
1283     /* Allocate Memory for outgoing messages */
1284     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1285     PetscCall(PetscArrayzero(sbuf1, size));
1286     PetscCall(PetscArrayzero(ptr, size));
1287 
1288     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1289     iptr = tmp;
1290     for (i = 0; i < nrqs; i++) {
1291       proc        = pa[i];
1292       sbuf1[proc] = iptr;
1293       iptr += w1[proc];
1294     }
1295 
1296     /* Form the outgoing messages */
1297     /* Initialize the header space */
1298     for (i = 0; i < nrqs; i++) {
1299       proc = pa[i];
1300       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1301       ptr[proc] = sbuf1[proc] + 3;
1302     }
1303 
1304     /* Parse the isrow and copy data into outbuf */
1305     PetscCall(PetscArrayzero(ctr, size));
1306     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1307       proc = row2proc[j];
1308       if (proc != rank) { /* copy to the outgoing buf*/
1309         *ptr[proc] = irow[j];
1310         ctr[proc]++;
1311         ptr[proc]++;
1312       }
1313     }
1314 
1315     /* Update the headers for the current IS */
1316     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1317       if ((ctr_j = ctr[j])) {
1318         sbuf1_j            = sbuf1[j];
1319         k                  = ++sbuf1_j[0];
1320         sbuf1_j[2 * k]     = ctr_j;
1321         sbuf1_j[2 * k - 1] = 0;
1322       }
1323     }
1324 
1325     /* Now post the sends */
1326     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1327     for (i = 0; i < nrqs; ++i) {
1328       proc = pa[i];
1329       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1330     }
1331 
1332     /* Post Receives to capture the buffer size */
1333     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1334     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1335 
1336     if (nrqs) rbuf2[0] = tmp + msz;
1337     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1338 
1339     for (i = 0; i < nrqs; ++i) {
1340       proc = pa[i];
1341       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1342     }
1343 
1344     PetscCall(PetscFree2(w1, w2));
1345 
1346     /* Send to other procs the buf size they should allocate */
1347     /* Receive messages*/
1348     PetscCall(PetscMalloc1(nrqr, &r_status1));
1349     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1350 
1351     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1352     for (i = 0; i < nrqr; ++i) {
1353       req_size[i] = 0;
1354       rbuf1_i     = rbuf1[i];
1355       start       = 2 * rbuf1_i[0] + 1;
1356       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1357       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1358       sbuf2_i = sbuf2[i];
1359       for (j = start; j < end; j++) {
1360         k          = rbuf1_i[j] - rstart;
1361         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1362         sbuf2_i[j] = ncols;
1363         req_size[i] += ncols;
1364       }
1365       req_source1[i] = r_status1[i].MPI_SOURCE;
1366 
1367       /* form the header */
1368       sbuf2_i[0] = req_size[i];
1369       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1370 
1371       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1372     }
1373 
1374     PetscCall(PetscFree(r_status1));
1375     PetscCall(PetscFree(r_waits1));
1376 
1377     /* rbuf2 is received, Post recv column indices a->j */
1378     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1379 
1380     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1381     for (i = 0; i < nrqs; ++i) {
1382       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1383       req_source2[i] = r_status2[i].MPI_SOURCE;
1384       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1385     }
1386 
1387     /* Wait on sends1 and sends2 */
1388     PetscCall(PetscMalloc1(nrqs, &s_status1));
1389     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1390     PetscCall(PetscFree(s_waits1));
1391     PetscCall(PetscFree(s_status1));
1392 
1393     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1394     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1395 
1396     /* Now allocate sending buffers for a->j, and send them off */
1397     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1398     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1399     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1400     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1401 
1402     for (i = 0; i < nrqr; i++) { /* for each requested message */
1403       rbuf1_i   = rbuf1[i];
1404       sbuf_aj_i = sbuf_aj[i];
1405       ct1       = 2 * rbuf1_i[0] + 1;
1406       ct2       = 0;
1407       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1408 
1409       kmax = rbuf1[i][2];
1410       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1411         row    = rbuf1_i[ct1] - rstart;
1412         nzA    = ai[row + 1] - ai[row];
1413         nzB    = bi[row + 1] - bi[row];
1414         ncols  = nzA + nzB;
1415         cworkA = aj + ai[row];
1416         cworkB = bj + bi[row];
1417 
1418         /* load the column indices for this row into cols*/
1419         cols = sbuf_aj_i + ct2;
1420 
1421         lwrite = 0;
1422         for (l = 0; l < nzB; l++) {
1423           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1424         }
1425         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1426         for (l = 0; l < nzB; l++) {
1427           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1428         }
1429 
1430         ct2 += ncols;
1431       }
1432       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1433     }
1434 
1435     /* create column map (cmap): global col of C -> local col of submat */
1436 #if defined(PETSC_USE_CTABLE)
1437     if (!allcolumns) {
1438       PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap));
1439       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1440       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1441         if (icol[j] >= cstart && icol[j] < cend) {
1442           cmap_loc[icol[j] - cstart] = j + 1;
1443         } else { /* use PetscTable for non-local col indices */
1444           PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES));
1445         }
1446       }
1447     } else {
1448       cmap     = NULL;
1449       cmap_loc = NULL;
1450     }
1451     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1452 #else
1453     if (!allcolumns) {
1454       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1455       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1456     } else {
1457       cmap = NULL;
1458     }
1459 #endif
1460 
1461     /* Create lens for MatSeqAIJSetPreallocation() */
1462     PetscCall(PetscCalloc1(nrow, &lens));
1463 
1464     /* Compute lens from local part of C */
1465     for (j = 0; j < nrow; j++) {
1466       row  = irow[j];
1467       proc = row2proc[j];
1468       if (proc == rank) {
1469         /* diagonal part A = c->A */
1470         ncols = ai[row - rstart + 1] - ai[row - rstart];
1471         cols  = aj + ai[row - rstart];
1472         if (!allcolumns) {
1473           for (k = 0; k < ncols; k++) {
1474 #if defined(PETSC_USE_CTABLE)
1475             tcol = cmap_loc[cols[k]];
1476 #else
1477             tcol = cmap[cols[k] + cstart];
1478 #endif
1479             if (tcol) lens[j]++;
1480           }
1481         } else { /* allcolumns */
1482           lens[j] = ncols;
1483         }
1484 
1485         /* off-diagonal part B = c->B */
1486         ncols = bi[row - rstart + 1] - bi[row - rstart];
1487         cols  = bj + bi[row - rstart];
1488         if (!allcolumns) {
1489           for (k = 0; k < ncols; k++) {
1490 #if defined(PETSC_USE_CTABLE)
1491             PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1492 #else
1493             tcol = cmap[bmap[cols[k]]];
1494 #endif
1495             if (tcol) lens[j]++;
1496           }
1497         } else { /* allcolumns */
1498           lens[j] += ncols;
1499         }
1500       }
1501     }
1502 
1503     /* Create row map (rmap): global row of C -> local row of submat */
1504 #if defined(PETSC_USE_CTABLE)
1505     PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap));
1506     for (j = 0; j < nrow; j++) {
1507       row  = irow[j];
1508       proc = row2proc[j];
1509       if (proc == rank) { /* a local row */
1510         rmap_loc[row - rstart] = j;
1511       } else {
1512         PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES));
1513       }
1514     }
1515 #else
1516     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1517     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1518 #endif
1519 
1520     /* Update lens from offproc data */
1521     /* recv a->j is done */
1522     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1523     for (i = 0; i < nrqs; i++) {
1524       proc    = pa[i];
1525       sbuf1_i = sbuf1[proc];
1526       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1527       ct1     = 2 + 1;
1528       ct2     = 0;
1529       rbuf2_i = rbuf2[i]; /* received length of C->j */
1530       rbuf3_i = rbuf3[i]; /* received C->j */
1531 
1532       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1533       max1 = sbuf1_i[2];
1534       for (k = 0; k < max1; k++, ct1++) {
1535 #if defined(PETSC_USE_CTABLE)
1536         PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row));
1537         row--;
1538         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1539 #else
1540         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1541 #endif
1542         /* Now, store row index of submat in sbuf1_i[ct1] */
1543         sbuf1_i[ct1] = row;
1544 
1545         nnz = rbuf2_i[ct1];
1546         if (!allcolumns) {
1547           for (l = 0; l < nnz; l++, ct2++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1550               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1551             } else {
1552               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1553             }
1554 #else
1555             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1556 #endif
1557             if (tcol) lens[row]++;
1558           }
1559         } else { /* allcolumns */
1560           lens[row] += nnz;
1561         }
1562       }
1563     }
1564     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1565     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1566 
1567     /* Create the submatrices */
1568     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1569     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1570 
1571     PetscCall(ISGetBlockSize(isrow[0], &i));
1572     PetscCall(ISGetBlockSize(iscol[0], &j));
1573     PetscCall(MatSetBlockSizes(submat, i, j));
1574     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1575     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1576 
1577     /* create struct Mat_SubSppt and attached it to submat */
1578     PetscCall(PetscNew(&smatis1));
1579     subc            = (Mat_SeqAIJ *)submat->data;
1580     subc->submatis1 = smatis1;
1581 
1582     smatis1->id          = 0;
1583     smatis1->nrqs        = nrqs;
1584     smatis1->nrqr        = nrqr;
1585     smatis1->rbuf1       = rbuf1;
1586     smatis1->rbuf2       = rbuf2;
1587     smatis1->rbuf3       = rbuf3;
1588     smatis1->sbuf2       = sbuf2;
1589     smatis1->req_source2 = req_source2;
1590 
1591     smatis1->sbuf1 = sbuf1;
1592     smatis1->ptr   = ptr;
1593     smatis1->tmp   = tmp;
1594     smatis1->ctr   = ctr;
1595 
1596     smatis1->pa          = pa;
1597     smatis1->req_size    = req_size;
1598     smatis1->req_source1 = req_source1;
1599 
1600     smatis1->allcolumns = allcolumns;
1601     smatis1->singleis   = PETSC_TRUE;
1602     smatis1->row2proc   = row2proc;
1603     smatis1->rmap       = rmap;
1604     smatis1->cmap       = cmap;
1605 #if defined(PETSC_USE_CTABLE)
1606     smatis1->rmap_loc = rmap_loc;
1607     smatis1->cmap_loc = cmap_loc;
1608 #endif
1609 
1610     smatis1->destroy     = submat->ops->destroy;
1611     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1612     submat->factortype   = C->factortype;
1613 
1614     /* compute rmax */
1615     rmax = 0;
1616     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1617 
1618   } else { /* scall == MAT_REUSE_MATRIX */
1619     submat = submats[0];
1620     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1621 
1622     subc        = (Mat_SeqAIJ *)submat->data;
1623     rmax        = subc->rmax;
1624     smatis1     = subc->submatis1;
1625     nrqs        = smatis1->nrqs;
1626     nrqr        = smatis1->nrqr;
1627     rbuf1       = smatis1->rbuf1;
1628     rbuf2       = smatis1->rbuf2;
1629     rbuf3       = smatis1->rbuf3;
1630     req_source2 = smatis1->req_source2;
1631 
1632     sbuf1 = smatis1->sbuf1;
1633     sbuf2 = smatis1->sbuf2;
1634     ptr   = smatis1->ptr;
1635     tmp   = smatis1->tmp;
1636     ctr   = smatis1->ctr;
1637 
1638     pa          = smatis1->pa;
1639     req_size    = smatis1->req_size;
1640     req_source1 = smatis1->req_source1;
1641 
1642     allcolumns = smatis1->allcolumns;
1643     row2proc   = smatis1->row2proc;
1644     rmap       = smatis1->rmap;
1645     cmap       = smatis1->cmap;
1646 #if defined(PETSC_USE_CTABLE)
1647     rmap_loc = smatis1->rmap_loc;
1648     cmap_loc = smatis1->cmap_loc;
1649 #endif
1650   }
1651 
1652   /* Post recv matrix values */
1653   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1654   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1655   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1656   for (i = 0; i < nrqs; ++i) {
1657     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1658     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1659   }
1660 
1661   /* Allocate sending buffers for a->a, and send them off */
1662   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1663   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1664   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1665   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1666 
1667   for (i = 0; i < nrqr; i++) {
1668     rbuf1_i   = rbuf1[i];
1669     sbuf_aa_i = sbuf_aa[i];
1670     ct1       = 2 * rbuf1_i[0] + 1;
1671     ct2       = 0;
1672     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1673 
1674     kmax = rbuf1_i[2];
1675     for (k = 0; k < kmax; k++, ct1++) {
1676       row    = rbuf1_i[ct1] - rstart;
1677       nzA    = ai[row + 1] - ai[row];
1678       nzB    = bi[row + 1] - bi[row];
1679       ncols  = nzA + nzB;
1680       cworkB = bj + bi[row];
1681       vworkA = a_a + ai[row];
1682       vworkB = b_a + bi[row];
1683 
1684       /* load the column values for this row into vals*/
1685       vals = sbuf_aa_i + ct2;
1686 
1687       lwrite = 0;
1688       for (l = 0; l < nzB; l++) {
1689         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1690       }
1691       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1692       for (l = 0; l < nzB; l++) {
1693         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1694       }
1695 
1696       ct2 += ncols;
1697     }
1698     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1699   }
1700 
1701   /* Assemble submat */
1702   /* First assemble the local rows */
1703   for (j = 0; j < nrow; j++) {
1704     row  = irow[j];
1705     proc = row2proc[j];
1706     if (proc == rank) {
1707       Crow = row - rstart; /* local row index of C */
1708 #if defined(PETSC_USE_CTABLE)
1709       row = rmap_loc[Crow]; /* row index of submat */
1710 #else
1711       row = rmap[row];
1712 #endif
1713 
1714       if (allcolumns) {
1715         /* diagonal part A = c->A */
1716         ncols = ai[Crow + 1] - ai[Crow];
1717         cols  = aj + ai[Crow];
1718         vals  = a_a + ai[Crow];
1719         i     = 0;
1720         for (k = 0; k < ncols; k++) {
1721           subcols[i]   = cols[k] + cstart;
1722           subvals[i++] = vals[k];
1723         }
1724 
1725         /* off-diagonal part B = c->B */
1726         ncols = bi[Crow + 1] - bi[Crow];
1727         cols  = bj + bi[Crow];
1728         vals  = b_a + bi[Crow];
1729         for (k = 0; k < ncols; k++) {
1730           subcols[i]   = bmap[cols[k]];
1731           subvals[i++] = vals[k];
1732         }
1733 
1734         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1735 
1736       } else { /* !allcolumns */
1737 #if defined(PETSC_USE_CTABLE)
1738         /* diagonal part A = c->A */
1739         ncols = ai[Crow + 1] - ai[Crow];
1740         cols  = aj + ai[Crow];
1741         vals  = a_a + ai[Crow];
1742         i     = 0;
1743         for (k = 0; k < ncols; k++) {
1744           tcol = cmap_loc[cols[k]];
1745           if (tcol) {
1746             subcols[i]   = --tcol;
1747             subvals[i++] = vals[k];
1748           }
1749         }
1750 
1751         /* off-diagonal part B = c->B */
1752         ncols = bi[Crow + 1] - bi[Crow];
1753         cols  = bj + bi[Crow];
1754         vals  = b_a + bi[Crow];
1755         for (k = 0; k < ncols; k++) {
1756           PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol));
1757           if (tcol) {
1758             subcols[i]   = --tcol;
1759             subvals[i++] = vals[k];
1760           }
1761         }
1762 #else
1763         /* diagonal part A = c->A */
1764         ncols = ai[Crow + 1] - ai[Crow];
1765         cols  = aj + ai[Crow];
1766         vals  = a_a + ai[Crow];
1767         i     = 0;
1768         for (k = 0; k < ncols; k++) {
1769           tcol = cmap[cols[k] + cstart];
1770           if (tcol) {
1771             subcols[i]   = --tcol;
1772             subvals[i++] = vals[k];
1773           }
1774         }
1775 
1776         /* off-diagonal part B = c->B */
1777         ncols = bi[Crow + 1] - bi[Crow];
1778         cols  = bj + bi[Crow];
1779         vals  = b_a + bi[Crow];
1780         for (k = 0; k < ncols; k++) {
1781           tcol = cmap[bmap[cols[k]]];
1782           if (tcol) {
1783             subcols[i]   = --tcol;
1784             subvals[i++] = vals[k];
1785           }
1786         }
1787 #endif
1788         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1789       }
1790     }
1791   }
1792 
1793   /* Now assemble the off-proc rows */
1794   for (i = 0; i < nrqs; i++) { /* for each requested message */
1795     /* recv values from other processes */
1796     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1797     proc    = pa[idex];
1798     sbuf1_i = sbuf1[proc];
1799     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1800     ct1     = 2 + 1;
1801     ct2     = 0;           /* count of received C->j */
1802     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1803     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1804     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1805     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1806 
1807     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1808     max1 = sbuf1_i[2];                  /* num of rows */
1809     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1810       row = sbuf1_i[ct1];               /* row index of submat */
1811       if (!allcolumns) {
1812         idex = 0;
1813         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1814           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1815           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1816 #if defined(PETSC_USE_CTABLE)
1817             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1818               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1819             } else {
1820               PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol));
1821             }
1822 #else
1823             tcol = cmap[rbuf3_i[ct2]];
1824 #endif
1825             if (tcol) {
1826               subcols[idex]   = --tcol; /* may not be sorted */
1827               subvals[idex++] = rbuf4_i[ct2];
1828 
1829               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1830                For reuse, we replace received C->j with index that should be inserted to submat */
1831               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1832             }
1833           }
1834           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1835         } else { /* scall == MAT_REUSE_MATRIX */
1836           submat = submats[0];
1837           subc   = (Mat_SeqAIJ *)submat->data;
1838 
1839           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1840           for (l = 0; l < nnz; l++) {
1841             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1842             subvals[idex++] = rbuf4_i[ct2];
1843           }
1844 
1845           bj = subc->j + subc->i[row]; /* sorted column indices */
1846           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1847         }
1848       } else {              /* allcolumns */
1849         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1850         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1851         ct2 += nnz;
1852       }
1853     }
1854   }
1855 
1856   /* sending a->a are done */
1857   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1858   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1859 
1860   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1861   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1862   submats[0] = submat;
1863 
1864   /* Restore the indices */
1865   PetscCall(ISRestoreIndices(isrow[0], &irow));
1866   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1867 
1868   /* Destroy allocated memory */
1869   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1870   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1871   if (sbuf_aa) {
1872     PetscCall(PetscFree(sbuf_aa[0]));
1873     PetscCall(PetscFree(sbuf_aa));
1874   }
1875 
1876   if (scall == MAT_INITIAL_MATRIX) {
1877     PetscCall(PetscFree(lens));
1878     if (sbuf_aj) {
1879       PetscCall(PetscFree(sbuf_aj[0]));
1880       PetscCall(PetscFree(sbuf_aj));
1881     }
1882   }
1883   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1884   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1885   PetscFunctionReturn(0);
1886 }
1887 
1888 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
1889   PetscInt  ncol;
1890   PetscBool colflag, allcolumns = PETSC_FALSE;
1891 
1892   PetscFunctionBegin;
1893   /* Allocate memory to hold all the submatrices */
1894   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1895 
1896   /* Check for special case: each processor gets entire matrix columns */
1897   PetscCall(ISIdentity(iscol[0], &colflag));
1898   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1899   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1900 
1901   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1902   PetscFunctionReturn(0);
1903 }
1904 
/* Extract the sequential submatrices C(isrow[i], iscol[i]) from the parallel AIJ matrix C.

   Collective on C: the staging loop below iterates the GLOBAL number of stages on
   every rank (even ranks whose local ismax is smaller, which then call the worker
   with max_no = 0) so the collective operations inside
   MatCreateSubMatrices_MPIAIJ_Local() match across the communicator.

   Input:  C     - the parallel MPIAIJ matrix
           ismax - number of local index-set pairs (may differ between ranks)
           isrow - row index sets, one per requested submatrix
           iscol - column index sets, one per requested submatrix
           scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
   Output: submat - array of ismax sequential submatrices; allocated here when
                    scall == MAT_INITIAL_MATRIX (with extra slots for dummy
                    matrices used by ranks that run empty stages)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = max number of submatrices extracted per stage, sized so each stage's
     integer workspace stays around 20 MB (20e6 bytes of PetscInt per column dimension) */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* always allow at least one submatrix per stage */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single identity row IS and identity column IS of full global size selects
         the entire (square) matrix: candidate for the fast all-matrix path below */
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* allow the user to opt out of the fast path via the options database */
        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Encode wantallmatrix as -1/0 so MPI_MAX acts as a logical AND across ranks:
       out[0] is -1 only if EVERY rank requested the whole matrix */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Retrieve the communication context (Mat_SubSppt) saved by the initial call */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* the initial call went through the single-IS path; reuse must too */
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* global stage count saved at initial extraction */
    }
  }

  /* Fast path: every rank wants the entire matrix */
  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  /* All ranks loop the same (global) nstages so the collectives inside
     MatCreateSubMatrices_MPIAIJ_Local() stay matched; a rank whose local
     requests are exhausted passes max_no = 0 and receives a dummy matrix */
  for (i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages; /* record the stage count so reuse works on empty ranks too */
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2000 
2001 /* -------------------------------------------------------------------------*/
2002 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) {
2003   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2004   Mat              A = c->A;
2005   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2006   const PetscInt **icol, **irow;
2007   PetscInt        *nrow, *ncol, start;
2008   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2009   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2010   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2011   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2012   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2013 #if defined(PETSC_USE_CTABLE)
2014   PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2015 #else
2016   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2017 #endif
2018   const PetscInt *irow_i;
2019   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2020   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2021   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2022   MPI_Comm        comm;
2023   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2024   PetscMPIInt    *onodes1, *olengths1, end;
2025   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2026   Mat_SubSppt    *smat_i;
2027   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2028   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2029 
2030   PetscFunctionBegin;
2031   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2032   size = c->size;
2033   rank = c->rank;
2034 
2035   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2036   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2037 
2038   for (i = 0; i < ismax; i++) {
2039     PetscCall(ISSorted(iscol[i], &issorted[i]));
2040     if (!issorted[i]) iscsorted = issorted[i];
2041 
2042     PetscCall(ISSorted(isrow[i], &issorted[i]));
2043 
2044     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2045     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2046 
2047     /* Check for special case: allcolumn */
2048     PetscCall(ISIdentity(iscol[i], &colflag));
2049     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2050     if (colflag && ncol[i] == C->cmap->N) {
2051       allcolumns[i] = PETSC_TRUE;
2052       icol[i]       = NULL;
2053     } else {
2054       allcolumns[i] = PETSC_FALSE;
2055       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2056     }
2057   }
2058 
2059   if (scall == MAT_REUSE_MATRIX) {
2060     /* Assumes new rows are same length as the old rows */
2061     for (i = 0; i < ismax; i++) {
2062       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2063       subc = (Mat_SeqAIJ *)submats[i]->data;
2064       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2065 
2066       /* Initial matrix as if empty */
2067       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2068 
2069       smat_i = subc->submatis1;
2070 
2071       nrqs        = smat_i->nrqs;
2072       nrqr        = smat_i->nrqr;
2073       rbuf1       = smat_i->rbuf1;
2074       rbuf2       = smat_i->rbuf2;
2075       rbuf3       = smat_i->rbuf3;
2076       req_source2 = smat_i->req_source2;
2077 
2078       sbuf1 = smat_i->sbuf1;
2079       sbuf2 = smat_i->sbuf2;
2080       ptr   = smat_i->ptr;
2081       tmp   = smat_i->tmp;
2082       ctr   = smat_i->ctr;
2083 
2084       pa          = smat_i->pa;
2085       req_size    = smat_i->req_size;
2086       req_source1 = smat_i->req_source1;
2087 
2088       allcolumns[i] = smat_i->allcolumns;
2089       row2proc[i]   = smat_i->row2proc;
2090       rmap[i]       = smat_i->rmap;
2091       cmap[i]       = smat_i->cmap;
2092     }
2093 
2094     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2095       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2096       smat_i = (Mat_SubSppt *)submats[0]->data;
2097 
2098       nrqs        = smat_i->nrqs;
2099       nrqr        = smat_i->nrqr;
2100       rbuf1       = smat_i->rbuf1;
2101       rbuf2       = smat_i->rbuf2;
2102       rbuf3       = smat_i->rbuf3;
2103       req_source2 = smat_i->req_source2;
2104 
2105       sbuf1 = smat_i->sbuf1;
2106       sbuf2 = smat_i->sbuf2;
2107       ptr   = smat_i->ptr;
2108       tmp   = smat_i->tmp;
2109       ctr   = smat_i->ctr;
2110 
2111       pa          = smat_i->pa;
2112       req_size    = smat_i->req_size;
2113       req_source1 = smat_i->req_source1;
2114 
2115       allcolumns[0] = PETSC_FALSE;
2116     }
2117   } else { /* scall == MAT_INITIAL_MATRIX */
2118     /* Get some new tags to keep the communication clean */
2119     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2120     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2121 
2122     /* evaluate communication - mesg to who, length of mesg, and buffer space
2123      required. Based on this, buffers are allocated, and data copied into them*/
2124     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2125 
2126     for (i = 0; i < ismax; i++) {
2127       jmax   = nrow[i];
2128       irow_i = irow[i];
2129 
2130       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2131       row2proc[i] = row2proc_i;
2132 
2133       if (issorted[i]) proc = 0;
2134       for (j = 0; j < jmax; j++) {
2135         if (!issorted[i]) proc = 0;
2136         row = irow_i[j];
2137         while (row >= C->rmap->range[proc + 1]) proc++;
2138         w4[proc]++;
2139         row2proc_i[j] = proc; /* map row index to proc */
2140       }
2141       for (j = 0; j < size; j++) {
2142         if (w4[j]) {
2143           w1[j] += w4[j];
2144           w3[j]++;
2145           w4[j] = 0;
2146         }
2147       }
2148     }
2149 
2150     nrqs     = 0; /* no of outgoing messages */
2151     msz      = 0; /* total mesg length (for all procs) */
2152     w1[rank] = 0; /* no mesg sent to self */
2153     w3[rank] = 0;
2154     for (i = 0; i < size; i++) {
2155       if (w1[i]) {
2156         w2[i] = 1;
2157         nrqs++;
2158       } /* there exists a message to proc i */
2159     }
2160     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2161     for (i = 0, j = 0; i < size; i++) {
2162       if (w1[i]) {
2163         pa[j] = i;
2164         j++;
2165       }
2166     }
2167 
2168     /* Each message would have a header = 1 + 2*(no of IS) + data */
2169     for (i = 0; i < nrqs; i++) {
2170       j = pa[i];
2171       w1[j] += w2[j] + 2 * w3[j];
2172       msz += w1[j];
2173     }
2174     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2175 
2176     /* Determine the number of messages to expect, their lengths, from from-ids */
2177     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2178     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2179 
2180     /* Now post the Irecvs corresponding to these messages */
2181     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2182     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2183 
2184     /* Allocate Memory for outgoing messages */
2185     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2186     PetscCall(PetscArrayzero(sbuf1, size));
2187     PetscCall(PetscArrayzero(ptr, size));
2188 
2189     {
2190       PetscInt *iptr = tmp;
2191       k              = 0;
2192       for (i = 0; i < nrqs; i++) {
2193         j = pa[i];
2194         iptr += k;
2195         sbuf1[j] = iptr;
2196         k        = w1[j];
2197       }
2198     }
2199 
2200     /* Form the outgoing messages. Initialize the header space */
2201     for (i = 0; i < nrqs; i++) {
2202       j           = pa[i];
2203       sbuf1[j][0] = 0;
2204       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2205       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2206     }
2207 
2208     /* Parse the isrow and copy data into outbuf */
2209     for (i = 0; i < ismax; i++) {
2210       row2proc_i = row2proc[i];
2211       PetscCall(PetscArrayzero(ctr, size));
2212       irow_i = irow[i];
2213       jmax   = nrow[i];
2214       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2215         proc = row2proc_i[j];
2216         if (proc != rank) { /* copy to the outgoing buf*/
2217           ctr[proc]++;
2218           *ptr[proc] = irow_i[j];
2219           ptr[proc]++;
2220         }
2221       }
2222       /* Update the headers for the current IS */
2223       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2224         if ((ctr_j = ctr[j])) {
2225           sbuf1_j            = sbuf1[j];
2226           k                  = ++sbuf1_j[0];
2227           sbuf1_j[2 * k]     = ctr_j;
2228           sbuf1_j[2 * k - 1] = i;
2229         }
2230       }
2231     }
2232 
2233     /*  Now  post the sends */
2234     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2235     for (i = 0; i < nrqs; ++i) {
2236       j = pa[i];
2237       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2238     }
2239 
2240     /* Post Receives to capture the buffer size */
2241     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2242     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2243     if (nrqs) rbuf2[0] = tmp + msz;
2244     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2245     for (i = 0; i < nrqs; ++i) {
2246       j = pa[i];
2247       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2248     }
2249 
2250     /* Send to other procs the buf size they should allocate */
2251     /* Receive messages*/
2252     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2253     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2254     {
2255       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2256       PetscInt *sbuf2_i;
2257 
2258       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2259       for (i = 0; i < nrqr; ++i) {
2260         req_size[i] = 0;
2261         rbuf1_i     = rbuf1[i];
2262         start       = 2 * rbuf1_i[0] + 1;
2263         end         = olengths1[i];
2264         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2265         sbuf2_i = sbuf2[i];
2266         for (j = start; j < end; j++) {
2267           id         = rbuf1_i[j] - rstart;
2268           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2269           sbuf2_i[j] = ncols;
2270           req_size[i] += ncols;
2271         }
2272         req_source1[i] = onodes1[i];
2273         /* form the header */
2274         sbuf2_i[0]     = req_size[i];
2275         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2276 
2277         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2278       }
2279     }
2280 
2281     PetscCall(PetscFree(onodes1));
2282     PetscCall(PetscFree(olengths1));
2283     PetscCall(PetscFree(r_waits1));
2284     PetscCall(PetscFree4(w1, w2, w3, w4));
2285 
2286     /* Receive messages*/
2287     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2288     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2289     for (i = 0; i < nrqs; ++i) {
2290       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2291       req_source2[i] = pa[i];
2292       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2293     }
2294     PetscCall(PetscFree(r_waits2));
2295 
2296     /* Wait on sends1 and sends2 */
2297     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2298     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2299     PetscCall(PetscFree(s_waits1));
2300     PetscCall(PetscFree(s_waits2));
2301 
2302     /* Now allocate sending buffers for a->j, and send them off */
2303     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2304     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2305     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2306     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2307 
2308     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2309     {
2310       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2311       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2312       PetscInt  cend = C->cmap->rend;
2313       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2314 
2315       for (i = 0; i < nrqr; i++) {
2316         rbuf1_i   = rbuf1[i];
2317         sbuf_aj_i = sbuf_aj[i];
2318         ct1       = 2 * rbuf1_i[0] + 1;
2319         ct2       = 0;
2320         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2321           kmax = rbuf1[i][2 * j];
2322           for (k = 0; k < kmax; k++, ct1++) {
2323             row    = rbuf1_i[ct1] - rstart;
2324             nzA    = a_i[row + 1] - a_i[row];
2325             nzB    = b_i[row + 1] - b_i[row];
2326             ncols  = nzA + nzB;
2327             cworkA = a_j + a_i[row];
2328             cworkB = b_j + b_i[row];
2329 
2330             /* load the column indices for this row into cols */
2331             cols = sbuf_aj_i + ct2;
2332 
2333             lwrite = 0;
2334             for (l = 0; l < nzB; l++) {
2335               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2336             }
2337             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2338             for (l = 0; l < nzB; l++) {
2339               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2340             }
2341 
2342             ct2 += ncols;
2343           }
2344         }
2345         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2346       }
2347     }
2348 
2349     /* create col map: global col of C -> local col of submatrices */
2350     {
2351       const PetscInt *icol_i;
2352 #if defined(PETSC_USE_CTABLE)
2353       for (i = 0; i < ismax; i++) {
2354         if (!allcolumns[i]) {
2355           PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]));
2356 
2357           jmax   = ncol[i];
2358           icol_i = icol[i];
2359           cmap_i = cmap[i];
2360           for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES));
2361         } else cmap[i] = NULL;
2362       }
2363 #else
2364       for (i = 0; i < ismax; i++) {
2365         if (!allcolumns[i]) {
2366           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2367           jmax   = ncol[i];
2368           icol_i = icol[i];
2369           cmap_i = cmap[i];
2370           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2371         } else cmap[i] = NULL;
2372       }
2373 #endif
2374     }
2375 
2376     /* Create lens which is required for MatCreate... */
2377     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2378     PetscCall(PetscMalloc1(ismax, &lens));
2379 
2380     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2381     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2382 
2383     /* Update lens from local data */
2384     for (i = 0; i < ismax; i++) {
2385       row2proc_i = row2proc[i];
2386       jmax       = nrow[i];
2387       if (!allcolumns[i]) cmap_i = cmap[i];
2388       irow_i = irow[i];
2389       lens_i = lens[i];
2390       for (j = 0; j < jmax; j++) {
2391         row  = irow_i[j];
2392         proc = row2proc_i[j];
2393         if (proc == rank) {
2394           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2395           if (!allcolumns[i]) {
2396             for (k = 0; k < ncols; k++) {
2397 #if defined(PETSC_USE_CTABLE)
2398               PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2399 #else
2400               tcol = cmap_i[cols[k]];
2401 #endif
2402               if (tcol) lens_i[j]++;
2403             }
2404           } else { /* allcolumns */
2405             lens_i[j] = ncols;
2406           }
2407           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2408         }
2409       }
2410     }
2411 
2412     /* Create row map: global row of C -> local row of submatrices */
2413 #if defined(PETSC_USE_CTABLE)
2414     for (i = 0; i < ismax; i++) {
2415       PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]));
2416       irow_i = irow[i];
2417       jmax   = nrow[i];
2418       for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES));
2419     }
2420 #else
2421     for (i = 0; i < ismax; i++) {
2422       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2423       rmap_i = rmap[i];
2424       irow_i = irow[i];
2425       jmax   = nrow[i];
2426       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2427     }
2428 #endif
2429 
2430     /* Update lens from offproc data */
2431     {
2432       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2433 
2434       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2435       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2436         sbuf1_i = sbuf1[pa[tmp2]];
2437         jmax    = sbuf1_i[0];
2438         ct1     = 2 * jmax + 1;
2439         ct2     = 0;
2440         rbuf2_i = rbuf2[tmp2];
2441         rbuf3_i = rbuf3[tmp2];
2442         for (j = 1; j <= jmax; j++) {
2443           is_no  = sbuf1_i[2 * j - 1];
2444           max1   = sbuf1_i[2 * j];
2445           lens_i = lens[is_no];
2446           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2447           rmap_i = rmap[is_no];
2448           for (k = 0; k < max1; k++, ct1++) {
2449 #if defined(PETSC_USE_CTABLE)
2450             PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row));
2451             row--;
2452             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2453 #else
2454             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2455 #endif
2456             max2 = rbuf2_i[ct1];
2457             for (l = 0; l < max2; l++, ct2++) {
2458               if (!allcolumns[is_no]) {
2459 #if defined(PETSC_USE_CTABLE)
2460                 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2461 #else
2462                 tcol = cmap_i[rbuf3_i[ct2]];
2463 #endif
2464                 if (tcol) lens_i[row]++;
2465               } else {         /* allcolumns */
2466                 lens_i[row]++; /* lens_i[row] += max2 ? */
2467               }
2468             }
2469           }
2470         }
2471       }
2472     }
2473     PetscCall(PetscFree(r_waits3));
2474     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2475     PetscCall(PetscFree(s_waits3));
2476 
2477     /* Create the submatrices */
2478     for (i = 0; i < ismax; i++) {
2479       PetscInt rbs, cbs;
2480 
2481       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2482       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2483 
2484       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2485       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2486 
2487       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2488       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2489       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2490 
2491       /* create struct Mat_SubSppt and attached it to submat */
2492       PetscCall(PetscNew(&smat_i));
2493       subc            = (Mat_SeqAIJ *)submats[i]->data;
2494       subc->submatis1 = smat_i;
2495 
2496       smat_i->destroy          = submats[i]->ops->destroy;
2497       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2498       submats[i]->factortype   = C->factortype;
2499 
2500       smat_i->id          = i;
2501       smat_i->nrqs        = nrqs;
2502       smat_i->nrqr        = nrqr;
2503       smat_i->rbuf1       = rbuf1;
2504       smat_i->rbuf2       = rbuf2;
2505       smat_i->rbuf3       = rbuf3;
2506       smat_i->sbuf2       = sbuf2;
2507       smat_i->req_source2 = req_source2;
2508 
2509       smat_i->sbuf1 = sbuf1;
2510       smat_i->ptr   = ptr;
2511       smat_i->tmp   = tmp;
2512       smat_i->ctr   = ctr;
2513 
2514       smat_i->pa          = pa;
2515       smat_i->req_size    = req_size;
2516       smat_i->req_source1 = req_source1;
2517 
2518       smat_i->allcolumns = allcolumns[i];
2519       smat_i->singleis   = PETSC_FALSE;
2520       smat_i->row2proc   = row2proc[i];
2521       smat_i->rmap       = rmap[i];
2522       smat_i->cmap       = cmap[i];
2523     }
2524 
2525     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2526       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2527       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2528       PetscCall(MatSetType(submats[0], MATDUMMY));
2529 
2530       /* create struct Mat_SubSppt and attached it to submat */
2531       PetscCall(PetscNewLog(submats[0], &smat_i));
2532       submats[0]->data = (void *)smat_i;
2533 
2534       smat_i->destroy          = submats[0]->ops->destroy;
2535       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2536       submats[0]->factortype   = C->factortype;
2537 
2538       smat_i->id          = 0;
2539       smat_i->nrqs        = nrqs;
2540       smat_i->nrqr        = nrqr;
2541       smat_i->rbuf1       = rbuf1;
2542       smat_i->rbuf2       = rbuf2;
2543       smat_i->rbuf3       = rbuf3;
2544       smat_i->sbuf2       = sbuf2;
2545       smat_i->req_source2 = req_source2;
2546 
2547       smat_i->sbuf1 = sbuf1;
2548       smat_i->ptr   = ptr;
2549       smat_i->tmp   = tmp;
2550       smat_i->ctr   = ctr;
2551 
2552       smat_i->pa          = pa;
2553       smat_i->req_size    = req_size;
2554       smat_i->req_source1 = req_source1;
2555 
2556       smat_i->allcolumns = PETSC_FALSE;
2557       smat_i->singleis   = PETSC_FALSE;
2558       smat_i->row2proc   = NULL;
2559       smat_i->rmap       = NULL;
2560       smat_i->cmap       = NULL;
2561     }
2562 
2563     if (ismax) PetscCall(PetscFree(lens[0]));
2564     PetscCall(PetscFree(lens));
2565     if (sbuf_aj) {
2566       PetscCall(PetscFree(sbuf_aj[0]));
2567       PetscCall(PetscFree(sbuf_aj));
2568     }
2569 
2570   } /* endof scall == MAT_INITIAL_MATRIX */
2571 
2572   /* Post recv matrix values */
2573   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2574   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2575   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2576   for (i = 0; i < nrqs; ++i) {
2577     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2578     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2579   }
2580 
2581   /* Allocate sending buffers for a->a, and send them off */
2582   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2583   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2584   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2585   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2586 
2587   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2588   {
2589     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2590     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2591     PetscInt     cend = C->cmap->rend;
2592     PetscInt    *b_j  = b->j;
2593     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2594 
2595     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2596     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2597     for (i = 0; i < nrqr; i++) {
2598       rbuf1_i   = rbuf1[i];
2599       sbuf_aa_i = sbuf_aa[i];
2600       ct1       = 2 * rbuf1_i[0] + 1;
2601       ct2       = 0;
2602       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2603         kmax = rbuf1_i[2 * j];
2604         for (k = 0; k < kmax; k++, ct1++) {
2605           row    = rbuf1_i[ct1] - rstart;
2606           nzA    = a_i[row + 1] - a_i[row];
2607           nzB    = b_i[row + 1] - b_i[row];
2608           ncols  = nzA + nzB;
2609           cworkB = b_j + b_i[row];
2610           vworkA = a_a + a_i[row];
2611           vworkB = b_a + b_i[row];
2612 
2613           /* load the column values for this row into vals*/
2614           vals = sbuf_aa_i + ct2;
2615 
2616           lwrite = 0;
2617           for (l = 0; l < nzB; l++) {
2618             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2619           }
2620           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2621           for (l = 0; l < nzB; l++) {
2622             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2623           }
2624 
2625           ct2 += ncols;
2626         }
2627       }
2628       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2629     }
2630     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2631     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2632   }
2633 
2634   /* Assemble the matrices */
2635   /* First assemble the local rows */
2636   for (i = 0; i < ismax; i++) {
2637     row2proc_i = row2proc[i];
2638     subc       = (Mat_SeqAIJ *)submats[i]->data;
2639     imat_ilen  = subc->ilen;
2640     imat_j     = subc->j;
2641     imat_i     = subc->i;
2642     imat_a     = subc->a;
2643 
2644     if (!allcolumns[i]) cmap_i = cmap[i];
2645     rmap_i = rmap[i];
2646     irow_i = irow[i];
2647     jmax   = nrow[i];
2648     for (j = 0; j < jmax; j++) {
2649       row  = irow_i[j];
2650       proc = row2proc_i[j];
2651       if (proc == rank) {
2652         old_row = row;
2653 #if defined(PETSC_USE_CTABLE)
2654         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2655         row--;
2656 #else
2657         row = rmap_i[row];
2658 #endif
2659         ilen_row = imat_ilen[row];
2660         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2661         mat_i = imat_i[row];
2662         mat_a = imat_a + mat_i;
2663         mat_j = imat_j + mat_i;
2664         if (!allcolumns[i]) {
2665           for (k = 0; k < ncols; k++) {
2666 #if defined(PETSC_USE_CTABLE)
2667             PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol));
2668 #else
2669             tcol = cmap_i[cols[k]];
2670 #endif
2671             if (tcol) {
2672               *mat_j++ = tcol - 1;
2673               *mat_a++ = vals[k];
2674               ilen_row++;
2675             }
2676           }
2677         } else { /* allcolumns */
2678           for (k = 0; k < ncols; k++) {
2679             *mat_j++ = cols[k]; /* global col index! */
2680             *mat_a++ = vals[k];
2681             ilen_row++;
2682           }
2683         }
2684         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2685 
2686         imat_ilen[row] = ilen_row;
2687       }
2688     }
2689   }
2690 
2691   /* Now assemble the off proc rows */
2692   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2693   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2694     sbuf1_i = sbuf1[pa[tmp2]];
2695     jmax    = sbuf1_i[0];
2696     ct1     = 2 * jmax + 1;
2697     ct2     = 0;
2698     rbuf2_i = rbuf2[tmp2];
2699     rbuf3_i = rbuf3[tmp2];
2700     rbuf4_i = rbuf4[tmp2];
2701     for (j = 1; j <= jmax; j++) {
2702       is_no  = sbuf1_i[2 * j - 1];
2703       rmap_i = rmap[is_no];
2704       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2705       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2706       imat_ilen = subc->ilen;
2707       imat_j    = subc->j;
2708       imat_i    = subc->i;
2709       imat_a    = subc->a;
2710       max1      = sbuf1_i[2 * j];
2711       for (k = 0; k < max1; k++, ct1++) {
2712         row = sbuf1_i[ct1];
2713 #if defined(PETSC_USE_CTABLE)
2714         PetscCall(PetscTableFind(rmap_i, row + 1, &row));
2715         row--;
2716 #else
2717         row = rmap_i[row];
2718 #endif
2719         ilen  = imat_ilen[row];
2720         mat_i = imat_i[row];
2721         mat_a = imat_a + mat_i;
2722         mat_j = imat_j + mat_i;
2723         max2  = rbuf2_i[ct1];
2724         if (!allcolumns[is_no]) {
2725           for (l = 0; l < max2; l++, ct2++) {
2726 #if defined(PETSC_USE_CTABLE)
2727             PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol));
2728 #else
2729             tcol = cmap_i[rbuf3_i[ct2]];
2730 #endif
2731             if (tcol) {
2732               *mat_j++ = tcol - 1;
2733               *mat_a++ = rbuf4_i[ct2];
2734               ilen++;
2735             }
2736           }
2737         } else { /* allcolumns */
2738           for (l = 0; l < max2; l++, ct2++) {
2739             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2740             *mat_a++ = rbuf4_i[ct2];
2741             ilen++;
2742           }
2743         }
2744         imat_ilen[row] = ilen;
2745       }
2746     }
2747   }
2748 
2749   if (!iscsorted) { /* sort column indices of the rows */
2750     for (i = 0; i < ismax; i++) {
2751       subc      = (Mat_SeqAIJ *)submats[i]->data;
2752       imat_j    = subc->j;
2753       imat_i    = subc->i;
2754       imat_a    = subc->a;
2755       imat_ilen = subc->ilen;
2756 
2757       if (allcolumns[i]) continue;
2758       jmax = nrow[i];
2759       for (j = 0; j < jmax; j++) {
2760         mat_i = imat_i[j];
2761         mat_a = imat_a + mat_i;
2762         mat_j = imat_j + mat_i;
2763         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2764       }
2765     }
2766   }
2767 
2768   PetscCall(PetscFree(r_waits4));
2769   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2770   PetscCall(PetscFree(s_waits4));
2771 
2772   /* Restore the indices */
2773   for (i = 0; i < ismax; i++) {
2774     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2775     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2776   }
2777 
2778   for (i = 0; i < ismax; i++) {
2779     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2780     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2781   }
2782 
2783   /* Destroy allocated memory */
2784   if (sbuf_aa) {
2785     PetscCall(PetscFree(sbuf_aa[0]));
2786     PetscCall(PetscFree(sbuf_aa));
2787   }
2788   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2789 
2790   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2791   PetscCall(PetscFree(rbuf4));
2792 
2793   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2794   PetscFunctionReturn(0);
2795 }
2796 
/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0, N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.

 This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
 Following this call, C->A & C->B have been created, even if empty.
 */
2808 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) {
2809   /* If making this function public, change the error returned in this function away from _PLIB. */
2810   Mat_MPIAIJ     *aij;
2811   Mat_SeqAIJ     *Baij;
2812   PetscBool       seqaij, Bdisassembled;
2813   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2814   PetscScalar     v;
2815   const PetscInt *rowindices, *colindices;
2816 
2817   PetscFunctionBegin;
2818   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2819   if (A) {
2820     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2821     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2822     if (rowemb) {
2823       PetscCall(ISGetLocalSize(rowemb, &m));
2824       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2825     } else {
2826       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2827     }
2828     if (dcolemb) {
2829       PetscCall(ISGetLocalSize(dcolemb, &n));
2830       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2831     } else {
2832       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2833     }
2834   }
2835   if (B) {
2836     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2837     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2838     if (rowemb) {
2839       PetscCall(ISGetLocalSize(rowemb, &m));
2840       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2841     } else {
2842       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2843     }
2844     if (ocolemb) {
2845       PetscCall(ISGetLocalSize(ocolemb, &n));
2846       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2847     } else {
2848       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2849     }
2850   }
2851 
2852   aij = (Mat_MPIAIJ *)C->data;
2853   if (!aij->A) {
2854     /* Mimic parts of MatMPIAIJSetPreallocation() */
2855     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2856     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2857     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2858     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2859     PetscCall(PetscLogObjectParent((PetscObject)C, (PetscObject)aij->A));
2860   }
2861   if (A) {
2862     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2863   } else {
2864     PetscCall(MatSetUp(aij->A));
2865   }
2866   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2867     /*
2868       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2869       need to "disassemble" B -- convert it to using C's global indices.
2870       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2871 
2872       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2873       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2874 
2875       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2876       At least avoid calling MatSetValues() and the implied searches?
2877     */
2878 
2879     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2880 #if defined(PETSC_USE_CTABLE)
2881       PetscCall(PetscTableDestroy(&aij->colmap));
2882 #else
2883       PetscCall(PetscFree(aij->colmap));
2884       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2885       if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C, -aij->B->cmap->n * sizeof(PetscInt)));
2886 #endif
2887       ngcol = 0;
2888       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2889       if (aij->garray) {
2890         PetscCall(PetscFree(aij->garray));
2891         PetscCall(PetscLogObjectMemory((PetscObject)C, -ngcol * sizeof(PetscInt)));
2892       }
2893       PetscCall(VecDestroy(&aij->lvec));
2894       PetscCall(VecScatterDestroy(&aij->Mvctx));
2895     }
2896     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2897     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2898   }
2899   Bdisassembled = PETSC_FALSE;
2900   if (!aij->B) {
2901     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2902     PetscCall(PetscLogObjectParent((PetscObject)C, (PetscObject)aij->B));
2903     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2904     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2905     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2906     Bdisassembled = PETSC_TRUE;
2907   }
2908   if (B) {
2909     Baij = (Mat_SeqAIJ *)B->data;
2910     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2911       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2912       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2913       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2914       PetscCall(PetscFree(nz));
2915     }
2916 
2917     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2918     shift      = rend - rstart;
2919     count      = 0;
2920     rowindices = NULL;
2921     colindices = NULL;
2922     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2923     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2924     for (i = 0; i < B->rmap->n; i++) {
2925       PetscInt row;
2926       row = i;
2927       if (rowindices) row = rowindices[i];
2928       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2929         col = Baij->j[count];
2930         if (colindices) col = colindices[col];
2931         if (Bdisassembled && col >= rstart) col += shift;
2932         v = Baij->a[count];
2933         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2934         ++count;
2935       }
2936     }
2937     /* No assembly for aij->B is necessary. */
2938     /* FIXME: set aij->B's nonzerostate correctly. */
2939   } else {
2940     PetscCall(MatSetUp(aij->B));
2941   }
2942   C->preallocated  = PETSC_TRUE;
2943   C->was_assembled = PETSC_FALSE;
2944   C->assembled     = PETSC_FALSE;
2945   /*
2946       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2947       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2948    */
2949   PetscFunctionReturn(0);
2950 }
2951 
/*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
 */
2955 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) {
2956   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2957 
2958   PetscFunctionBegin;
2959   PetscValidPointer(A, 2);
2960   PetscValidPointer(B, 3);
2961   /* FIXME: make sure C is assembled */
2962   *A = aij->A;
2963   *B = aij->B;
2964   /* Note that we don't incref *A and *B, so be careful! */
2965   PetscFunctionReturn(0);
2966 }
2967 
2968 /*
2969   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2970   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2971 */
2972 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat)) {
2973   PetscMPIInt size, flag;
2974   PetscInt    i, ii, cismax, ispar;
2975   Mat        *A, *B;
2976   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
2977 
2978   PetscFunctionBegin;
2979   if (!ismax) PetscFunctionReturn(0);
2980 
2981   for (i = 0, cismax = 0; i < ismax; ++i) {
2982     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
2983     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2984     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
2985     if (size > 1) ++cismax;
2986   }
2987 
2988   /*
2989      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2990      ispar counts the number of parallel ISs across C's comm.
2991   */
2992   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
2993   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2994     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
2995     PetscFunctionReturn(0);
2996   }
2997 
2998   /* if (ispar) */
2999   /*
3000     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3001     These are used to extract the off-diag portion of the resulting parallel matrix.
3002     The row IS for the off-diag portion is the same as for the diag portion,
3003     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3004   */
3005   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3006   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3007   for (i = 0, ii = 0; i < ismax; ++i) {
3008     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3009     if (size > 1) {
3010       /*
3011          TODO: This is the part that's ***NOT SCALABLE***.
3012          To fix this we need to extract just the indices of C's nonzero columns
3013          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3014          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3015          to be done without serializing on the IS list, so, most likely, it is best
3016          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3017       */
3018       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3019       /* Now we have to
3020          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3021              were sorted on each rank, concatenated they might no longer be sorted;
3022          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3023              indices in the nondecreasing order to the original index positions.
3024          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3025       */
3026       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3027       PetscCall(ISSort(ciscol[ii]));
3028       ++ii;
3029     }
3030   }
3031   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3032   for (i = 0, ii = 0; i < ismax; ++i) {
3033     PetscInt        j, issize;
3034     const PetscInt *indices;
3035 
3036     /*
3037        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3038      */
3039     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3040     PetscCall(ISSort(isrow[i]));
3041     PetscCall(ISGetLocalSize(isrow[i], &issize));
3042     PetscCall(ISGetIndices(isrow[i], &indices));
3043     for (j = 1; j < issize; ++j) {
3044       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3045     }
3046     PetscCall(ISRestoreIndices(isrow[i], &indices));
3047     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3048     PetscCall(ISSort(iscol[i]));
3049     PetscCall(ISGetLocalSize(iscol[i], &issize));
3050     PetscCall(ISGetIndices(iscol[i], &indices));
3051     for (j = 1; j < issize; ++j) {
3052       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3053     }
3054     PetscCall(ISRestoreIndices(iscol[i], &indices));
3055     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3056     if (size > 1) {
3057       cisrow[ii] = isrow[i];
3058       ++ii;
3059     }
3060   }
3061   /*
3062     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3063     array of sequential matrices underlying the resulting parallel matrices.
3064     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3065     contain duplicates.
3066 
3067     There are as many diag matrices as there are original index sets. There are only as many parallel
3068     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3069 
3070     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3071     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3072       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3073       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3074       with A[i] and B[ii] extracted from the corresponding MPI submat.
3075     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3076       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3077       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3078       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3079       values into A[i] and B[ii] sitting inside the corresponding submat.
3080     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3081       A[i], B[ii], AA[i] or BB[ii] matrices.
3082   */
3083   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3084   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3085 
3086   /* Now obtain the sequential A and B submatrices separately. */
3087   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3088   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3089   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3090 
3091   /*
3092     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3093     matrices A & B have been extracted directly into the parallel matrices containing them, or
3094     simply into the sequential matrix identical with the corresponding A (if size == 1).
3095     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3096     to have the same sparsity pattern.
3097     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3098     must be constructed for C. This is done by setseqmat(s).
3099   */
3100   for (i = 0, ii = 0; i < ismax; ++i) {
3101     /*
3102        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3103        That way we can avoid sorting and computing permutations when reusing.
3104        To this end:
3105         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3106         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3107           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3108     */
3109     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3110 
3111     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3112     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3113     if (size > 1) {
3114       if (scall == MAT_INITIAL_MATRIX) {
3115         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3116         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3117         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3118         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3119         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3120       }
3121       /*
3122         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3123       */
3124       {
3125         Mat AA = A[i], BB = B[ii];
3126 
3127         if (AA || BB) {
3128           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3129           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3130           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3131         }
3132         PetscCall(MatDestroy(&AA));
3133       }
3134       PetscCall(ISDestroy(ciscol + ii));
3135       PetscCall(ISDestroy(ciscol_p + ii));
3136       ++ii;
3137     } else { /* if (size == 1) */
3138       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3139       if (isrow_p[i] || iscol_p[i]) {
3140         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3141         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3142         /* Otherwise A is extracted straight into (*submats)[i]. */
3143         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3144         PetscCall(MatDestroy(A + i));
3145       } else (*submat)[i] = A[i];
3146     }
3147     PetscCall(ISDestroy(&isrow_p[i]));
3148     PetscCall(ISDestroy(&iscol_p[i]));
3149   }
3150   PetscCall(PetscFree2(cisrow, ciscol));
3151   PetscCall(PetscFree2(isrow_p, iscol_p));
3152   PetscCall(PetscFree(ciscol_p));
3153   PetscCall(PetscFree(A));
3154   PetscCall(MatDestroySubMatrices(cismax, &B));
3155   PetscFunctionReturn(0);
3156 }
3157 
3158 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
3159   PetscFunctionBegin;
3160   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3161   PetscFunctionReturn(0);
3162 }
3163