xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision df4cd43f92eaa320656440c40edb1046daee8f75)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
20 
21 /*
22   Takes a general IS and builds a block version of it that contains the given indices plus any additional indices needed to fill out complete blocks
23 
24    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
25    that is a very major recoding job.
26 
27    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
28 */
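/*
   Worked example (hypothetical data): with bs = 2 and is[i] holding the point indices
   {3, 6, 7, 11}, the distinct block indices are {1, 3, 5}, and the block IS created below
   expands them back to the point indices {2,3, 6,7, 10,11}.
*/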
29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30 {
31   PetscFunctionBegin;
32   for (PetscInt i = 0; i < imax; i++) {
33     if (!is[i]) break;
34     PetscInt        n = 0, N, Nmax, Nmin;
35     const PetscInt *idx;
36     PetscInt       *nidx = NULL;
37     MPI_Comm        comm;
38     PetscBT         bt;
39 
40     PetscCall(ISGetLocalSize(is[i], &N));
41     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42       PetscCall(ISGetIndices(is[i], &idx));
43       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44       Nmin = Nmin / bs;
45       Nmax = Nmax / bs;
46       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47       for (PetscInt j = 0; j < N; j++) {
48         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49       }
50       PetscCall(PetscMalloc1(n, &nidx));
51       n = 0;
52       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53       for (PetscInt j = 0; j < N; j++) {
54         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55       }
56       PetscCall(PetscBTDestroy(&bt));
57       PetscCall(ISRestoreIndices(is[i], &idx));
58     }
59     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60     PetscCall(ISDestroy(is + i));
61     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67 {
68   PetscInt i;
69 
70   PetscFunctionBegin;
71   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
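/*
   A minimal usage sketch from application code (assumptions, not taken from this file: C is an
   assembled MPIAIJ matrix and is holds the locally owned rows of one subdomain, e.g. as set up by PCASM):
.vb
   IS is;
   ...
   PetscCall(MatIncreaseOverlap(C, 1, &is, 2)); // grow the subdomain by two levels of overlap
.ve
   For MPIAIJ matrices MatIncreaseOverlap() dispatches to MatIncreaseOverlap_MPIAIJ() above.
*/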
76 
77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78 {
79   PetscInt i;
80 
81   PetscFunctionBegin;
82   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89 {
90   MPI_Comm        comm;
91   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
92   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94   const PetscInt *indices_i, **indices;
95   PetscLayout     rmap;
96   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
97   PetscSF         sf;
98   PetscSFNode    *remote;
99 
100   PetscFunctionBegin;
101   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103   PetscCallMPI(MPI_Comm_size(comm, &size));
104   /* get row map to determine where rows should be going */
105   PetscCall(MatGetLayouts(mat, &rmap, NULL));
106   /* retrieve IS data and put it all together so that we
107    * can optimize communication
108    */
109   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110   for (i = 0, tlength = 0; i < nidx; i++) {
111     PetscCall(ISGetLocalSize(is[i], &length[i]));
112     tlength += length[i];
113     PetscCall(ISGetIndices(is[i], &indices[i]));
114   }
115   /* find these rows on remote processors */
116   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118   nrrows = 0;
119   for (i = 0; i < nidx; i++) {
120     length_i  = length[i];
121     indices_i = indices[i];
122     for (j = 0; j < length_i; j++) {
123       owner = -1;
124       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125       /* remote processors */
126       if (owner != rank) {
127         tosizes_temp[owner]++;               /* number of rows to owner */
128         rrow_ranks[nrrows]   = owner;        /* processor */
129         rrow_isids[nrrows]   = i;            /* is id */
130         remoterows[nrrows++] = indices_i[j]; /* row */
131       }
132     }
133     PetscCall(ISRestoreIndices(is[i], &indices[i]));
134   }
135   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136   /* test if we need to exchange messages;
137    * generally speaking, we do not need to exchange
138    * any data when the overlap is 1
139    */
140   PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
141   /* we do not have any messages;
142    * this usually corresponds to overlap 1
143    */
144   if (!reducednrrows) {
145     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148     PetscFunctionReturn(PETSC_SUCCESS);
149   }
150   nto = 0;
151   /* send sizes and ranks for building a two-sided communication */
152   for (i = 0; i < size; i++) {
153     if (tosizes_temp[i]) {
154       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155       tosizes_temp[i]  = nto;                 /* a map from processor to index */
156       toranks[nto++]   = i;                   /* processor */
157     }
158   }
159   PetscCall(PetscMalloc1(nto + 1, &toffsets));
160   toffsets[0] = 0;
161   for (i = 0; i < nto; i++) {
162     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
163     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
164   }
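  /*
     Hypothetical example: if this rank must request 3 rows from rank 2 and 1 row from rank 5,
     then nto = 2, toranks = {2, 5}, tosizes = {6, 0, 2, 6} (pairs <message length, offset>),
     and toffsets = {0, 6, 8}; each requested row contributes the pair <is id, row> to todata.
  */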
165   /* send information to other processors */
166   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167   nrecvrows = 0;
168   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169   PetscCall(PetscMalloc1(nrecvrows, &remote));
170   nrecvrows = 0;
171   for (i = 0; i < nfrom; i++) {
172     for (j = 0; j < fromsizes[2 * i]; j++) {
173       remote[nrecvrows].rank    = fromranks[i];
174       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175     }
176   }
177   PetscCall(PetscSFCreate(comm, &sf));
178   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
179   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
180   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181   PetscCall(PetscSFSetFromOptions(sf));
182   /* message pair <no of is, row>  */
183   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184   for (i = 0; i < nrrows; i++) {
185     owner                 = rrow_ranks[i];       /* processor */
186     j                     = tosizes_temp[owner]; /* index */
187     todata[toffsets[j]++] = rrow_isids[i];
188     todata[toffsets[j]++] = remoterows[i];
189   }
190   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192   PetscCall(PetscFree(toffsets));
193   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195   PetscCall(PetscSFDestroy(&sf));
196   /* send rows belonging to remote processes so that we can get the overlapping data back */
197   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198   PetscCall(PetscFree2(todata, fromdata));
199   PetscCall(PetscFree(fromsizes));
200   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201   PetscCall(PetscFree(fromranks));
202   nrecvrows = 0;
203   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204   PetscCall(PetscCalloc1(nrecvrows, &todata));
205   PetscCall(PetscMalloc1(nrecvrows, &remote));
206   nrecvrows = 0;
207   for (i = 0; i < nto; i++) {
208     for (j = 0; j < tosizes[2 * i]; j++) {
209       remote[nrecvrows].rank    = toranks[i];
210       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211     }
212   }
213   PetscCall(PetscSFCreate(comm, &sf));
214   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
215   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
216   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217   PetscCall(PetscSFSetFromOptions(sf));
218   /* overlap communication and computation */
219   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222   PetscCall(PetscSFDestroy(&sf));
223   PetscCall(PetscFree2(sbdata, sbsizes));
224   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225   PetscCall(PetscFree(toranks));
226   PetscCall(PetscFree(tosizes));
227   PetscCall(PetscFree(todata));
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232 {
233   PetscInt       *isz, isz_i, i, j, is_id, data_size;
234   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
235   const PetscInt *indices_i_temp;
236   MPI_Comm       *iscomms;
237 
238   PetscFunctionBegin;
239   max_lsize = 0;
240   PetscCall(PetscMalloc1(nidx, &isz));
241   for (i = 0; i < nidx; i++) {
242     PetscCall(ISGetLocalSize(is[i], &lsize));
243     max_lsize = lsize > max_lsize ? lsize : max_lsize;
244     isz[i]    = lsize;
245   }
246   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247   for (i = 0; i < nidx; i++) {
248     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249     PetscCall(ISGetIndices(is[i], &indices_i_temp));
250     PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
251     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252     PetscCall(ISDestroy(&is[i]));
253   }
254   /* retrieve information to get row id and its overlap */
255   for (i = 0; i < nrecvs;) {
256     is_id     = recvdata[i++];
257     data_size = recvdata[i++];
258     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259     isz_i     = isz[is_id];
260     for (j = 0; j < data_size; j++) {
261       col                = recvdata[i++];
262       indices_i[isz_i++] = col;
263     }
264     isz[is_id] = isz_i;
265   }
266   /* remove duplicate entries */
267   for (i = 0; i < nidx; i++) {
268     indices_i = indices_temp + (max_lsize + nrecvs) * i;
269     isz_i     = isz[i];
270     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272     PetscCall(PetscCommDestroy(&iscomms[i]));
273   }
274   PetscCall(PetscFree(isz));
275   PetscCall(PetscFree2(indices_temp, iscomms));
276   PetscFunctionReturn(PETSC_SUCCESS);
277 }
278 
279 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280 {
281   PetscLayout     rmap, cmap;
282   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285   const PetscInt *gcols, *ai, *aj, *bi, *bj;
286   Mat             amat, bmat;
287   PetscMPIInt     rank;
288   PetscBool       done;
289   MPI_Comm        comm;
290 
291   PetscFunctionBegin;
292   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293   PetscCallMPI(MPI_Comm_rank(comm, &rank));
294   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
295   /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
296   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
298   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
300   /* total number of nonzero values is used to estimate the memory usage in the next step */
301   tnz = ai[an] + bi[bn];
302   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305   /* to find the longest message */
306   max_fszs = 0;
307   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
308   /* is there a better way to estimate the number of nonzeros in the mat? */
309   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310   for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
311   rows_pos  = 0;
312   totalrows = 0;
313   for (i = 0; i < nfrom; i++) {
314     PetscCall(PetscArrayzero(rows_pos_i, nidx));
315     /* group data together */
316     for (j = 0; j < fromsizes[2 * i]; j += 2) {
317       is_id                       = fromrows[rows_pos++]; /* no of is */
318       rows_i                      = rows_data[is_id];
319       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320     }
321     /* estimate the space needed to avoid multiple allocations */
322     for (j = 0; j < nidx; j++) {
323       indvc_ij = 0;
324       rows_i   = rows_data[j];
325       for (l = 0; l < rows_pos_i[j]; l++) {
326         row   = rows_i[l] - rstart;
327         start = ai[row];
328         end   = ai[row + 1];
329         for (k = start; k < end; k++) { /* Amat */
330           col                     = aj[k] + cstart;
331           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332         }
333         start = bi[row];
334         end   = bi[row + 1];
335         for (k = start; k < end; k++) { /* Bmat */
336           col                     = gcols[bj[k]];
337           indices_tmp[indvc_ij++] = col;
338         }
339       }
340       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341       indv_counts[i * nidx + j] = indvc_ij;
342       totalrows += indvc_ij;
343     }
344   }
345   /* message triple <no of is, number of rows, rows> */
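  /*
     Hypothetical example: replying to the i-th requesting rank about is[0] with overlap
     columns {4, 7, 9} and about is[2] with columns {11, 12} appends
     {0, 3, 4, 7, 9, 2, 2, 11, 12} to sbdata and adds 2 + 3 + 2 + 2 = 9 to sbsizes[2*i].
  */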
346   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347   totalrows = 0;
348   rows_pos  = 0;
349   /* repeat the same grouping pass, this time filling the send buffers */
350   for (i = 0; i < nfrom; i++) {
351     PetscCall(PetscArrayzero(rows_pos_i, nidx));
352     for (j = 0; j < fromsizes[2 * i]; j += 2) {
353       is_id                       = fromrows[rows_pos++];
354       rows_i                      = rows_data[is_id];
355       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356     }
357     /* add data  */
358     for (j = 0; j < nidx; j++) {
359       if (!indv_counts[i * nidx + j]) continue;
360       indvc_ij            = 0;
361       sbdata[totalrows++] = j;
362       sbdata[totalrows++] = indv_counts[i * nidx + j];
363       sbsizes[2 * i] += 2;
364       rows_i = rows_data[j];
365       for (l = 0; l < rows_pos_i[j]; l++) {
366         row   = rows_i[l] - rstart;
367         start = ai[row];
368         end   = ai[row + 1];
369         for (k = start; k < end; k++) { /* Amat */
370           col                     = aj[k] + cstart;
371           indices_tmp[indvc_ij++] = col;
372         }
373         start = bi[row];
374         end   = bi[row + 1];
375         for (k = start; k < end; k++) { /* Bmat */
376           col                     = gcols[bj[k]];
377           indices_tmp[indvc_ij++] = col;
378         }
379       }
380       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381       sbsizes[2 * i] += indvc_ij;
382       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383       totalrows += indvc_ij;
384     }
385   }
386   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387   offsets[0] = 0;
388   for (i = 0; i < nfrom; i++) {
389     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
390     sbsizes[2 * i + 1] = offsets[i];
391   }
392   PetscCall(PetscFree(offsets));
393   if (sbrowsizes) *sbrowsizes = sbsizes;
394   if (sbrows) *sbrows = sbdata;
395   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398   PetscFunctionReturn(PETSC_SUCCESS);
399 }
400 
401 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402 {
403   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405   PetscInt        lsize, lsize_tmp;
406   PetscMPIInt     rank, owner;
407   Mat             amat, bmat;
408   PetscBool       done;
409   PetscLayout     cmap, rmap;
410   MPI_Comm        comm;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414   PetscCallMPI(MPI_Comm_rank(comm, &rank));
415   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
418   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
420   /* is this a safe way to compute the number of nonzero values? */
421   tnz = ai[an] + bi[bn];
422   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
425   /* this estimates memory better than the old implementation,
426    * which used the global size of the matrix
427    */
428   PetscCall(PetscMalloc1(tnz, &indices_temp));
429   for (i = 0; i < nidx; i++) {
430     MPI_Comm iscomm;
431 
432     PetscCall(ISGetLocalSize(is[i], &lsize));
433     PetscCall(ISGetIndices(is[i], &indices));
434     lsize_tmp = 0;
435     for (j = 0; j < lsize; j++) {
436       owner = -1;
437       row   = indices[j];
438       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439       if (owner != rank) continue;
440       /* local number */
441       row -= rstart;
442       start = ai[row];
443       end   = ai[row + 1];
444       for (k = start; k < end; k++) { /* Amat */
445         col                       = aj[k] + cstart;
446         indices_temp[lsize_tmp++] = col;
447       }
448       start = bi[row];
449       end   = bi[row + 1];
450       for (k = start; k < end; k++) { /* Bmat */
451         col                       = gcols[bj[k]];
452         indices_temp[lsize_tmp++] = col;
453       }
454     }
455     PetscCall(ISRestoreIndices(is[i], &indices));
456     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457     PetscCall(ISDestroy(&is[i]));
458     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460     PetscCall(PetscCommDestroy(&iscomm));
461   }
462   PetscCall(PetscFree(indices_temp));
463   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465   PetscFunctionReturn(PETSC_SUCCESS);
466 }
467 
468 /*
469   Sample message format:
470   If processor A wants processor B to process some elements corresponding
471   to index sets is[1] and is[5], the message looks like:
472   mesg [0] = 2   (no of index sets in the mesg)
473   -----------
474   mesg [1] = 1   => is[1]
475   mesg [2] = size of is[1]
476   -----------
477   mesg [3] = 5   => is[5]
478   mesg [4] = size of is[5]
479   -----------
480   mesg [5]
481   ...
482   mesg [n]   = data of is[1]
483   -----------
484   mesg [n+1] ... mesg [m] = data of is[5]
485   -----------
486 
487   Notes:
488   nrqs - no of requests sent (or to be sent out)
489   nrqr - no of requests received (which have to be or which have been processed)
490 */
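/*
   Concrete instance (hypothetical values): requesting rows 10 and 11 for is[1] and row 40
   for is[5] gives mesg = {2, 1, 2, 5, 1, 10, 11, 40}.
*/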
491 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492 {
493   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495   const PetscInt **idx, *idx_i;
496   PetscInt        *n, **data, len;
497 #if defined(PETSC_USE_CTABLE)
498   PetscHMapI *table_data, table_data_i;
499   PetscInt   *tdata, tcount, tcount_max;
500 #else
501   PetscInt *data_i, *d_p;
502 #endif
503   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506   PetscBT     *table;
507   MPI_Comm     comm;
508   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509   MPI_Status  *recv_status;
510   MPI_Comm    *iscomms;
511   char        *t_p;
512 
513   PetscFunctionBegin;
514   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515   size = c->size;
516   rank = c->rank;
517   M    = C->rmap->N;
518 
519   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
521 
522   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
523 
524   for (i = 0; i < imax; i++) {
525     PetscCall(ISGetIndices(is[i], &idx[i]));
526     PetscCall(ISGetLocalSize(is[i], &n[i]));
527   }
528 
529   /* evaluate communication - mesg to who, length of mesg, and buffer space
530      required. Based on this, buffers are allocated, and data copied into them */
531   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532   for (i = 0; i < imax; i++) {
533     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534     idx_i = idx[i];
535     len   = n[i];
536     for (j = 0; j < len; j++) {
537       row = idx_i[j];
538       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540       w4[proc]++;
541     }
542     for (j = 0; j < size; j++) {
543       if (w4[j]) {
544         w1[j] += w4[j];
545         w3[j]++;
546       }
547     }
548   }
549 
550   nrqs     = 0; /* no of outgoing messages */
551   msz      = 0; /* total mesg length (for all procs) */
552   w1[rank] = 0; /* no mesg sent to itself */
553   w3[rank] = 0;
554   for (i = 0; i < size; i++) {
555     if (w1[i]) {
556       w2[i] = 1;
557       nrqs++;
558     } /* there exists a message to proc i */
559   }
560   /* pa - is list of processors to communicate with */
561   PetscCall(PetscMalloc1(nrqs, &pa));
562   for (i = 0, j = 0; i < size; i++) {
563     if (w1[i]) {
564       pa[j] = i;
565       j++;
566     }
567   }
568 
569   /* Each message would have a header = 1 + 2*(no of IS) + data */
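  /*
     Hypothetical example: if 4 rows belonging to 2 different index sets are destined for
     proc j, then w3[j] = 2 and the total message length becomes w1[j] = 4 + 1 + 2*2 = 9.
  */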
570   for (i = 0; i < nrqs; i++) {
571     j = pa[i];
572     w1[j] += w2[j] + 2 * w3[j];
573     msz += w1[j];
574   }
575 
576   /* Determine the number of messages to expect, their lengths, and the from-ids */
577   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
579 
580   /* Now post the Irecvs corresponding to these messages */
581   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
582 
583   /* Allocate Memory for outgoing messages */
584   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585   PetscCall(PetscArrayzero(outdat, size));
586   PetscCall(PetscArrayzero(ptr, size));
587 
588   {
589     PetscInt *iptr = tmp, ict = 0;
590     for (i = 0; i < nrqs; i++) {
591       j = pa[i];
592       iptr += ict;
593       outdat[j] = iptr;
594       ict       = w1[j];
595     }
596   }
597 
598   /* Form the outgoing messages */
599   /* plug in the headers */
600   for (i = 0; i < nrqs; i++) {
601     j            = pa[i];
602     outdat[j][0] = 0;
603     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605   }
606 
607   /* Memory for doing local proc's work */
608   {
609     PetscInt M_BPB_imax = 0;
610 #if defined(PETSC_USE_CTABLE)
611     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612     PetscCall(PetscMalloc1(imax, &table_data));
613     for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616 #else
617     PetscInt Mimax = 0;
618     PetscCall(PetscIntMultError(M, imax, &Mimax));
619     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621     for (i = 0; i < imax; i++) {
622       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623       data[i]  = d_p + M * i;
624     }
625 #endif
626   }
627 
628   /* Parse the IS and update local tables and the outgoing buf with the data */
629   {
630     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631     PetscBT  table_i;
632 
633     for (i = 0; i < imax; i++) {
634       PetscCall(PetscArrayzero(ctr, size));
635       n_i     = n[i];
636       table_i = table[i];
637       idx_i   = idx[i];
638 #if defined(PETSC_USE_CTABLE)
639       table_data_i = table_data[i];
640 #else
641       data_i = data[i];
642 #endif
643       isz_i = isz[i];
644       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645         row = idx_i[j];
646         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647         if (proc != rank) { /* copy to the outgoing buffer */
648           ctr[proc]++;
649           *ptr[proc] = row;
650           ptr[proc]++;
651         } else if (!PetscBTLookupSet(table_i, row)) {
652 #if defined(PETSC_USE_CTABLE)
653           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654 #else
655           data_i[isz_i] = row; /* Update the local table */
656 #endif
657           isz_i++;
658         }
659       }
660       /* Update the headers for the current IS */
661       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662         if ((ctr_j = ctr[j])) {
663           outdat_j            = outdat[j];
664           k                   = ++outdat_j[0];
665           outdat_j[2 * k]     = ctr_j;
666           outdat_j[2 * k - 1] = i;
667         }
668       }
669       isz[i] = isz_i;
670     }
671   }
672 
673   /*  Now  post the sends */
674   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675   for (i = 0; i < nrqs; ++i) {
676     j = pa[i];
677     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678   }
679 
680   /* No longer need the original indices */
681   PetscCall(PetscMalloc1(imax, &iscomms));
682   for (i = 0; i < imax; ++i) {
683     PetscCall(ISRestoreIndices(is[i], idx + i));
684     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685   }
686   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
687 
688   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
689 
690     /* Do Local work */
691 #if defined(PETSC_USE_CTABLE)
692   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693 #else
694   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695 #endif
696 
697   /* Receive messages */
698   PetscCall(PetscMalloc1(nrqr, &recv_status));
699   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
701 
702   /* Phase 1 sends are complete - deallocate buffers */
703   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704   PetscCall(PetscFree4(w1, w2, w3, w4));
705 
706   PetscCall(PetscMalloc1(nrqr, &xdata));
707   PetscCall(PetscMalloc1(nrqr, &isz1));
708   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709   PetscCall(PetscFree(rbuf[0]));
710   PetscCall(PetscFree(rbuf));
711 
712   /* Send the data back */
713   /* Do a global reduction to know the buffer space req for incoming messages */
714   {
715     PetscMPIInt *rw1;
716 
717     PetscCall(PetscCalloc1(size, &rw1));
718 
719     for (i = 0; i < nrqr; ++i) {
720       proc = recv_status[i].MPI_SOURCE;
721 
722       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723       rw1[proc] = isz1[i];
724     }
725     PetscCall(PetscFree(onodes1));
726     PetscCall(PetscFree(olengths1));
727 
728     /* Determine the number of messages to expect, their lengths, and the from-ids */
729     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730     PetscCall(PetscFree(rw1));
731   }
732   /* Now post the Irecvs corresponding to these messages */
733   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
734 
735   /* Now  post the sends */
736   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737   for (i = 0; i < nrqr; ++i) {
738     j = recv_status[i].MPI_SOURCE;
739     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740   }
741 
742   /* receive work done on other processors */
743   {
744     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745     PetscMPIInt idex;
746     PetscBT     table_i;
747 
748     for (i = 0; i < nrqs; ++i) {
749       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750       /* Process the message */
751       rbuf2_i = rbuf2[idex];
752       ct1     = 2 * rbuf2_i[0] + 1;
753       jmax    = rbuf2[idex][0];
754       for (j = 1; j <= jmax; j++) {
755         max     = rbuf2_i[2 * j];
756         is_no   = rbuf2_i[2 * j - 1];
757         isz_i   = isz[is_no];
758         table_i = table[is_no];
759 #if defined(PETSC_USE_CTABLE)
760         table_data_i = table_data[is_no];
761 #else
762         data_i = data[is_no];
763 #endif
764         for (k = 0; k < max; k++, ct1++) {
765           row = rbuf2_i[ct1];
766           if (!PetscBTLookupSet(table_i, row)) {
767 #if defined(PETSC_USE_CTABLE)
768             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769 #else
770             data_i[isz_i] = row;
771 #endif
772             isz_i++;
773           }
774         }
775         isz[is_no] = isz_i;
776       }
777     }
778 
779     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780   }
781 
782 #if defined(PETSC_USE_CTABLE)
783   tcount_max = 0;
784   for (i = 0; i < imax; ++i) {
785     table_data_i = table_data[i];
786     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787     if (tcount_max < tcount) tcount_max = tcount;
788   }
789   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790 #endif
791 
792   for (i = 0; i < imax; ++i) {
793 #if defined(PETSC_USE_CTABLE)
794     PetscHashIter tpos;
795 
796     table_data_i = table_data[i];
797     PetscHashIterBegin(table_data_i, tpos);
798     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799       PetscHashIterGetKey(table_data_i, tpos, k);
800       PetscHashIterGetVal(table_data_i, tpos, j);
801       PetscHashIterNext(table_data_i, tpos);
802       tdata[--j] = --k;
803     }
804     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805 #else
806     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807 #endif
808     PetscCall(PetscCommDestroy(&iscomms[i]));
809   }
810 
811   PetscCall(PetscFree(iscomms));
812   PetscCall(PetscFree(onodes2));
813   PetscCall(PetscFree(olengths2));
814 
815   PetscCall(PetscFree(pa));
816   PetscCall(PetscFree(rbuf2[0]));
817   PetscCall(PetscFree(rbuf2));
818   PetscCall(PetscFree(s_waits1));
819   PetscCall(PetscFree(r_waits1));
820   PetscCall(PetscFree(s_waits2));
821   PetscCall(PetscFree(r_waits2));
822   PetscCall(PetscFree(recv_status));
823   if (xdata) {
824     PetscCall(PetscFree(xdata[0]));
825     PetscCall(PetscFree(xdata));
826   }
827   PetscCall(PetscFree(isz1));
828 #if defined(PETSC_USE_CTABLE)
829   for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830   PetscCall(PetscFree(table_data));
831   PetscCall(PetscFree(tdata));
832   PetscCall(PetscFree4(table, data, isz, t_p));
833 #else
834   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835 #endif
836   PetscFunctionReturn(PETSC_SUCCESS);
837 }
838 
839 /*
840    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap(), to do
841        the work on the local processor.
842 
843      Inputs:
844       C      - MAT_MPIAIJ;
845       imax - total no of index sets processed at a time;
846       table  - an array of char - size = m bits.
847 
848      Output:
849       isz    - array containing the count of the solution elements corresponding
850                to each index set;
851       data or table_data  - pointer to the solutions
852 */
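/*
   Note: with PETSC_USE_CTABLE the hash map stores the pair <row + 1, position + 1>; the shift
   by one presumably keeps row 0 and position 0 distinguishable from an empty entry, and it is
   undone (tdata[--j] = --row) when the entries are copied back into a plain array.
*/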
853 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854 {
855   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856   Mat         A = c->A, B = c->B;
857   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
859   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
860   PetscBT     table_i;
861 #if defined(PETSC_USE_CTABLE)
862   PetscHMapI    table_data_i;
863   PetscHashIter tpos;
864   PetscInt      tcount, *tdata;
865 #else
866   PetscInt *data_i;
867 #endif
868 
869   PetscFunctionBegin;
870   rstart = C->rmap->rstart;
871   cstart = C->cmap->rstart;
872   ai     = a->i;
873   aj     = a->j;
874   bi     = b->i;
875   bj     = b->j;
876   garray = c->garray;
877 
878   for (i = 0; i < imax; i++) {
879 #if defined(PETSC_USE_CTABLE)
880     /* copy existing entries of table_data_i into tdata[] */
881     table_data_i = table_data[i];
882     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
884 
885     PetscCall(PetscMalloc1(tcount, &tdata));
886     PetscHashIterBegin(table_data_i, tpos);
887     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888       PetscHashIterGetKey(table_data_i, tpos, row);
889       PetscHashIterGetVal(table_data_i, tpos, j);
890       PetscHashIterNext(table_data_i, tpos);
891       tdata[--j] = --row;
892       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893     }
894 #else
895     data_i = data[i];
896 #endif
897     table_i = table[i];
898     isz_i   = isz[i];
899     max     = isz[i];
900 
901     for (j = 0; j < max; j++) {
902 #if defined(PETSC_USE_CTABLE)
903       row = tdata[j] - rstart;
904 #else
905       row = data_i[j] - rstart;
906 #endif
907       start = ai[row];
908       end   = ai[row + 1];
909       for (k = start; k < end; k++) { /* Amat */
910         val = aj[k] + cstart;
911         if (!PetscBTLookupSet(table_i, val)) {
912 #if defined(PETSC_USE_CTABLE)
913           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914 #else
915           data_i[isz_i] = val;
916 #endif
917           isz_i++;
918         }
919       }
920       start = bi[row];
921       end   = bi[row + 1];
922       for (k = start; k < end; k++) { /* Bmat */
923         val = garray[bj[k]];
924         if (!PetscBTLookupSet(table_i, val)) {
925 #if defined(PETSC_USE_CTABLE)
926           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927 #else
928           data_i[isz_i] = val;
929 #endif
930           isz_i++;
931         }
932       }
933     }
934     isz[i] = isz_i;
935 
936 #if defined(PETSC_USE_CTABLE)
937     PetscCall(PetscFree(tdata));
938 #endif
939   }
940   PetscFunctionReturn(PETSC_SUCCESS);
941 }
942 
943 /*
944       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945          and return the output
946 
947          Input:
948            C    - the matrix
949            nrqr - no of messages being processed.
950            rbuf - an array of pointers to the received requests
951 
952          Output:
953            xdata - array of messages to be sent back
954            isz1  - size of each message
955 
956   For better efficiency perhaps we should malloc each xdata[i] separately;
957 then if a realloc is required we need only copy the data for that one message
958 rather than all previous messages, as it is now where a single large chunk of
959 memory is used.
960 
961 */
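/*
   The reply mirrors the request layout: xdata[i] = {no of IS, is id, count, ..., data ...}.
   Hypothetical instance: if row 7 is requested for is[3] and row 7 couples to columns
   {2, 5, 9}, the reply is xdata[i] = {1, 3, 4, 7, 2, 5, 9} (the row itself plus its columns).
*/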
962 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963 {
964   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965   Mat         A = c->A, B = c->B;
966   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970   PetscInt   *rbuf_i, kmax, rbuf_0;
971   PetscBT     xtable;
972 
973   PetscFunctionBegin;
974   m      = C->rmap->N;
975   rstart = C->rmap->rstart;
976   cstart = C->cmap->rstart;
977   ai     = a->i;
978   aj     = a->j;
979   bi     = b->i;
980   bj     = b->j;
981   garray = c->garray;
982 
983   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984     rbuf_i = rbuf[i];
985     rbuf_0 = rbuf_i[0];
986     ct += rbuf_0;
987     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988   }
989 
990   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991   else max1 = 1;
992   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993   if (nrqr) {
994     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995     ++no_malloc;
996   }
997   PetscCall(PetscBTCreate(m, &xtable));
998   PetscCall(PetscArrayzero(isz1, nrqr));
999 
1000   ct3 = 0;
1001   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002     rbuf_i = rbuf[i];
1003     rbuf_0 = rbuf_i[0];
1004     ct1    = 2 * rbuf_0 + 1;
1005     ct2    = ct1;
1006     ct3 += ct1;
1007     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008       PetscCall(PetscBTMemzero(m, xtable));
1009       oct2 = ct2;
1010       kmax = rbuf_i[2 * j];
1011       for (k = 0; k < kmax; k++, ct1++) {
1012         row = rbuf_i[ct1];
1013         if (!PetscBTLookupSet(xtable, row)) {
1014           if (!(ct3 < mem_estimate)) {
1015             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016             PetscCall(PetscMalloc1(new_estimate, &tmp));
1017             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018             PetscCall(PetscFree(xdata[0]));
1019             xdata[0]     = tmp;
1020             mem_estimate = new_estimate;
1021             ++no_malloc;
1022             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023           }
1024           xdata[i][ct2++] = row;
1025           ct3++;
1026         }
1027       }
1028       for (k = oct2, max2 = ct2; k < max2; k++) {
1029         row   = xdata[i][k] - rstart;
1030         start = ai[row];
1031         end   = ai[row + 1];
1032         for (l = start; l < end; l++) {
1033           val = aj[l] + cstart;
1034           if (!PetscBTLookupSet(xtable, val)) {
1035             if (!(ct3 < mem_estimate)) {
1036               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037               PetscCall(PetscMalloc1(new_estimate, &tmp));
1038               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039               PetscCall(PetscFree(xdata[0]));
1040               xdata[0]     = tmp;
1041               mem_estimate = new_estimate;
1042               ++no_malloc;
1043               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044             }
1045             xdata[i][ct2++] = val;
1046             ct3++;
1047           }
1048         }
1049         start = bi[row];
1050         end   = bi[row + 1];
1051         for (l = start; l < end; l++) {
1052           val = garray[bj[l]];
1053           if (!PetscBTLookupSet(xtable, val)) {
1054             if (!(ct3 < mem_estimate)) {
1055               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056               PetscCall(PetscMalloc1(new_estimate, &tmp));
1057               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058               PetscCall(PetscFree(xdata[0]));
1059               xdata[0]     = tmp;
1060               mem_estimate = new_estimate;
1061               ++no_malloc;
1062               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063             }
1064             xdata[i][ct2++] = val;
1065             ct3++;
1066           }
1067         }
1068       }
1069       /* Update the header*/
1070       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072     }
1073     xdata[i][0] = rbuf_0;
1074     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075     isz1[i] = ct2; /* size of each message */
1076   }
1077   PetscCall(PetscBTDestroy(&xtable));
1078   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079   PetscFunctionReturn(PETSC_SUCCESS);
1080 }
1081 
1082 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083 /*
1084     Every processor gets the entire matrix
1085 */
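/*
   A minimal calling sketch (assuming A is an assembled MPIAIJ matrix):
.vb
   Mat *Bseq;
   PetscCall(MatCreateSubMatrix_MPIAIJ_All(A, MAT_GET_VALUES, MAT_INITIAL_MATRIX, &Bseq));
   // Bseq[0] now holds the entire matrix as a sequential matrix on every rank
   PetscCall(MatCreateSubMatrix_MPIAIJ_All(A, MAT_GET_VALUES, MAT_REUSE_MATRIX, &Bseq)); // refresh the values only
.ve
*/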
1086 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087 {
1088   Mat         B;
1089   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091   PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1092   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1093   PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1094 
1095   PetscFunctionBegin;
1096   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098   if (scall == MAT_INITIAL_MATRIX) {
1099     /* Tell every processor the number of nonzeros per row */
1100     PetscCall(PetscMalloc1(A->rmap->N, &lens));
1101     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1102     PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1103     for (i = 0; i < size; i++) {
1104       recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1105       displs[i]     = A->rmap->range[i];
1106     }
1107     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1108     /*     Create the sequential matrix of the same type as the local block diagonal  */
1109     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1110     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1111     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1112     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1113     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1114     PetscCall(PetscCalloc1(2, Bin));
1115     **Bin = B;
1116     b     = (Mat_SeqAIJ *)B->data;
1117 
1118     /*   Copy my part of matrix column indices over    */
1119     sendcount  = ad->nz + bd->nz;
1120     jsendbuf   = b->j + b->i[rstarts[rank]];
1121     a_jsendbuf = ad->j;
1122     b_jsendbuf = bd->j;
1123     n          = A->rmap->rend - A->rmap->rstart;
1124     cnt        = 0;
1125     for (i = 0; i < n; i++) {
1126       /* put in lower diagonal portion */
1127       m = bd->i[i + 1] - bd->i[i];
1128       while (m > 0) {
1129         /* is it above diagonal (in bd (compressed) numbering) */
1130         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1131         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1132         m--;
1133       }
1134 
1135       /* put in diagonal portion */
1136       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1137 
1138       /* put in upper diagonal portion */
1139       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1140     }
1141     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1142 
1143     /*  Gather all column indices to all processors */
1144     for (i = 0; i < size; i++) {
1145       recvcounts[i] = 0;
1146       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1147     }
1148     displs[0] = 0;
1149     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1150     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1151     /* Assemble the matrix into usable form (note numerical values not yet set) */
1152     /* set the b->ilen (length of each row) values */
1153     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154     /* set the b->i indices */
1155     b->i[0] = 0;
1156     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157     PetscCall(PetscFree(lens));
1158     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1160 
1161   } else {
1162     B = **Bin;
1163     b = (Mat_SeqAIJ *)B->data;
1164   }
1165 
1166   /* Copy my part of matrix numerical values into the values location */
1167   if (flag == MAT_GET_VALUES) {
1168     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169     MatScalar         *sendbuf, *recvbuf;
1170 
1171     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173     sendcount = ad->nz + bd->nz;
1174     sendbuf   = b->a + b->i[rstarts[rank]];
1175     a_sendbuf = ada;
1176     b_sendbuf = bda;
1177     b_sendj   = bd->j;
1178     n         = A->rmap->rend - A->rmap->rstart;
1179     cnt       = 0;
1180     for (i = 0; i < n; i++) {
1181       /* put in lower diagonal portion */
1182       m = bd->i[i + 1] - bd->i[i];
1183       while (m > 0) {
1184         /* is it above diagonal (in bd (compressed) numbering) */
1185         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         m--;
1188         b_sendj++;
1189       }
1190 
1191       /* put in diagonal portion */
1192       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1193 
1194       /* put in upper diagonal portion */
1195       while (m-- > 0) {
1196         sendbuf[cnt++] = *b_sendbuf++;
1197         b_sendj++;
1198       }
1199     }
1200     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1201 
1202     /*  Gather all numerical values to all processors  */
1203     if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1204     for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1205     displs[0] = 0;
1206     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1207     recvbuf = b->a;
1208     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
1209     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1210     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1211   } /* endof (flag == MAT_GET_VALUES) */
1212   PetscCall(PetscFree2(recvcounts, displs));
1213 
1214   PetscCall(MatPropagateSymmetryOptions(A, B));
1215   PetscFunctionReturn(PETSC_SUCCESS);
1216 }
1217 
1218 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1219 {
1220   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1221   Mat             submat, A = c->A, B = c->B;
1222   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1223   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1224   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1225   const PetscInt *icol, *irow;
1226   PetscInt        nrow, ncol, start;
1227   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1228   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1229   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1230   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1231   PetscInt       *lens, rmax, ncols, *cols, Crow;
1232 #if defined(PETSC_USE_CTABLE)
1233   PetscHMapI cmap, rmap;
1234   PetscInt  *cmap_loc, *rmap_loc;
1235 #else
1236   PetscInt *cmap, *rmap;
1237 #endif
1238   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1239   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1240   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1241   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1242   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1243   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1244   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1245   MPI_Comm      comm;
1246   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1247   PetscMPIInt  *onodes1, *olengths1, idex, end;
1248   Mat_SubSppt  *smatis1;
1249   PetscBool     isrowsorted, iscolsorted;
1250 
1251   PetscFunctionBegin;
1252   PetscValidLogicalCollectiveInt(C, ismax, 2);
1253   PetscValidLogicalCollectiveEnum(C, scall, 5);
1254   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1255   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1256   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1257   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1258   size = c->size;
1259   rank = c->rank;
1260 
1261   PetscCall(ISSorted(iscol[0], &iscolsorted));
1262   PetscCall(ISSorted(isrow[0], &isrowsorted));
1263   PetscCall(ISGetIndices(isrow[0], &irow));
1264   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1265   if (allcolumns) {
1266     icol = NULL;
1267     ncol = C->cmap->N;
1268   } else {
1269     PetscCall(ISGetIndices(iscol[0], &icol));
1270     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1271   }
1272 
1273   if (scall == MAT_INITIAL_MATRIX) {
1274     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1275 
1276     /* Get some new tags to keep the communication clean */
1277     tag1 = ((PetscObject)C)->tag;
1278     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1279     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1280 
1281     /* evaluate communication - mesg to who, length of mesg, and buffer space
1282      required. Based on this, buffers are allocated, and data copied into them */
1283     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1284     PetscCall(PetscMalloc1(nrow, &row2proc));
1285 
1286     /* w1[proc] = num of rows owned by proc -- to be requested */
1287     proc = 0;
1288     nrqs = 0; /* num of outgoing messages */
1289     for (j = 0; j < nrow; j++) {
1290       row = irow[j];
1291       if (!isrowsorted) proc = 0;
1292       while (row >= C->rmap->range[proc + 1]) proc++;
1293       w1[proc]++;
1294       row2proc[j] = proc; /* map row index to proc */
1295 
1296       if (proc != rank && !w2[proc]) {
1297         w2[proc] = 1;
1298         nrqs++;
1299       }
1300     }
1301     w1[rank] = 0; /* rows owned by self will not be requested */
1302 
1303     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1304     for (proc = 0, j = 0; proc < size; proc++) {
1305       if (w1[proc]) pa[j++] = proc;
1306     }
1307 
1308     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1309     msz = 0; /* total mesg length (for all procs) */
1310     for (i = 0; i < nrqs; i++) {
1311       proc = pa[i];
1312       w1[proc] += 3;
1313       msz += w1[proc];
1314     }
1315     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1316 
1317     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1318     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1319     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1320 
1321     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1322        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1323     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1324 
1325     /* Now post the Irecvs corresponding to these messages */
1326     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1327 
1328     PetscCall(PetscFree(onodes1));
1329     PetscCall(PetscFree(olengths1));
1330 
1331     /* Allocate Memory for outgoing messages */
1332     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1333     PetscCall(PetscArrayzero(sbuf1, size));
1334     PetscCall(PetscArrayzero(ptr, size));
1335 
1336     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1337     iptr = tmp;
1338     for (i = 0; i < nrqs; i++) {
1339       proc        = pa[i];
1340       sbuf1[proc] = iptr;
1341       iptr += w1[proc];
1342     }
1343 
1344     /* Form the outgoing messages */
1345     /* Initialize the header space */
1346     for (i = 0; i < nrqs; i++) {
1347       proc = pa[i];
1348       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1349       ptr[proc] = sbuf1[proc] + 3;
1350     }
1351 
1352     /* Parse the isrow and copy data into outbuf */
1353     PetscCall(PetscArrayzero(ctr, size));
1354     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1355       proc = row2proc[j];
1356       if (proc != rank) { /* copy to the outgoing buf*/
1357         *ptr[proc] = irow[j];
1358         ctr[proc]++;
1359         ptr[proc]++;
1360       }
1361     }
1362 
1363     /* Update the headers for the current IS */
1364     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1365       if ((ctr_j = ctr[j])) {
1366         sbuf1_j            = sbuf1[j];
1367         k                  = ++sbuf1_j[0];
1368         sbuf1_j[2 * k]     = ctr_j;
1369         sbuf1_j[2 * k - 1] = 0;
1370       }
1371     }
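    /* At this point each outgoing message sbuf1[proc] is laid out as
       [nIS | is_no, count | row indices...]: slot 0 holds the number of index sets packed
       (always 1 here), slot 1 the IS number (0), slot 2 the number of requested rows, and
       the requested global row indices follow */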
1372 
1373     /* Now post the sends */
1374     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1375     for (i = 0; i < nrqs; ++i) {
1376       proc = pa[i];
1377       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1378     }
1379 
1380     /* Post Receives to capture the buffer size */
1381     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1382     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1383 
1384     if (nrqs) rbuf2[0] = tmp + msz;
1385     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
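    /* The replies carrying buffer sizes are received into the upper half of tmp (allocated
       with length 2*msz above), so rbuf2 needs no storage of its own */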
1386 
1387     for (i = 0; i < nrqs; ++i) {
1388       proc = pa[i];
1389       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1390     }
1391 
1392     PetscCall(PetscFree2(w1, w2));
1393 
1394     /* Send to other procs the buf size they should allocate */
1395     /* Receive messages*/
1396     PetscCall(PetscMalloc1(nrqr, &r_status1));
1397     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1398 
1399     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1400     for (i = 0; i < nrqr; ++i) {
1401       req_size[i] = 0;
1402       rbuf1_i     = rbuf1[i];
1403       start       = 2 * rbuf1_i[0] + 1;
1404       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1405       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1406       sbuf2_i = sbuf2[i];
1407       for (j = start; j < end; j++) {
1408         k          = rbuf1_i[j] - rstart;
1409         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1410         sbuf2_i[j] = ncols;
1411         req_size[i] += ncols;
1412       }
1413       req_source1[i] = r_status1[i].MPI_SOURCE;
1414 
1415       /* form the header */
1416       sbuf2_i[0] = req_size[i];
1417       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1418 
1419       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1420     }
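    /* Each reply sbuf2[i] mirrors the incoming request: slot 0 is the total number of nonzeros
       this rank will send back (req_size[i]), the rest of the header is copied from the request,
       and then one nonzero count follows for every requested row */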
1421 
1422     PetscCall(PetscFree(r_status1));
1423     PetscCall(PetscFree(r_waits1));
1424 
1425     /* rbuf2 is received; post recvs for the column indices a->j */
1426     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1427 
1428     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1429     for (i = 0; i < nrqs; ++i) {
1430       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1431       req_source2[i] = r_status2[i].MPI_SOURCE;
1432       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1433     }
1434 
1435     /* Wait on sends1 and sends2 */
1436     PetscCall(PetscMalloc1(nrqs, &s_status1));
1437     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1438     PetscCall(PetscFree(s_waits1));
1439     PetscCall(PetscFree(s_status1));
1440 
1441     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1442     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1443 
1444     /* Now allocate sending buffers for a->j, and send them off */
1445     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1446     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1447     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1448     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1449 
1450     for (i = 0; i < nrqr; i++) { /* for each requested message */
1451       rbuf1_i   = rbuf1[i];
1452       sbuf_aj_i = sbuf_aj[i];
1453       ct1       = 2 * rbuf1_i[0] + 1;
1454       ct2       = 0;
1455       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1456 
1457       kmax = rbuf1[i][2];
1458       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1459         row    = rbuf1_i[ct1] - rstart;
1460         nzA    = ai[row + 1] - ai[row];
1461         nzB    = bi[row + 1] - bi[row];
1462         ncols  = nzA + nzB;
1463         cworkA = aj + ai[row];
1464         cworkB = bj + bi[row];
1465 
1466         /* load the column indices for this row into cols*/
1467         cols = sbuf_aj_i + ct2;
1468 
1469         lwrite = 0;
1470         for (l = 0; l < nzB; l++) {
1471           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472         }
1473         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474         for (l = 0; l < nzB; l++) {
1475           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476         }
1477 
1478         ct2 += ncols;
1479       }
1480       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1481     }
1482 
1483     /* create column map (cmap): global col of C -> local col of submat */
1484 #if defined(PETSC_USE_CTABLE)
1485     if (!allcolumns) {
1486       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1487       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1488       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1489         if (icol[j] >= cstart && icol[j] < cend) {
1490           cmap_loc[icol[j] - cstart] = j + 1;
1491         } else { /* use PetscHMapI for non-local col indices */
1492           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1493         }
1494       }
1495     } else {
1496       cmap     = NULL;
1497       cmap_loc = NULL;
1498     }
1499     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1500 #else
1501     if (!allcolumns) {
1502       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1503       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1504     } else {
1505       cmap = NULL;
1506     }
1507 #endif
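    /* cmap maps a global column index of C to (local submat column + 1); 0 means the column is
       not in the submatrix. With PETSC_USE_CTABLE, locally owned columns use the dense array
       cmap_loc and only off-process columns go through the hash map */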
1508 
1509     /* Create lens for MatSeqAIJSetPreallocation() */
1510     PetscCall(PetscCalloc1(nrow, &lens));
1511 
1512     /* Compute lens from local part of C */
1513     for (j = 0; j < nrow; j++) {
1514       row  = irow[j];
1515       proc = row2proc[j];
1516       if (proc == rank) {
1517         /* diagonal part A = c->A */
1518         ncols = ai[row - rstart + 1] - ai[row - rstart];
1519         cols  = aj + ai[row - rstart];
1520         if (!allcolumns) {
1521           for (k = 0; k < ncols; k++) {
1522 #if defined(PETSC_USE_CTABLE)
1523             tcol = cmap_loc[cols[k]];
1524 #else
1525             tcol = cmap[cols[k] + cstart];
1526 #endif
1527             if (tcol) lens[j]++;
1528           }
1529         } else { /* allcolumns */
1530           lens[j] = ncols;
1531         }
1532 
1533         /* off-diagonal part B = c->B */
1534         ncols = bi[row - rstart + 1] - bi[row - rstart];
1535         cols  = bj + bi[row - rstart];
1536         if (!allcolumns) {
1537           for (k = 0; k < ncols; k++) {
1538 #if defined(PETSC_USE_CTABLE)
1539             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1540 #else
1541             tcol = cmap[bmap[cols[k]]];
1542 #endif
1543             if (tcol) lens[j]++;
1544           }
1545         } else { /* allcolumns */
1546           lens[j] += ncols;
1547         }
1548       }
1549     }
1550 
1551     /* Create row map (rmap): global row of C -> local row of submat */
1552 #if defined(PETSC_USE_CTABLE)
1553     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1554     for (j = 0; j < nrow; j++) {
1555       row  = irow[j];
1556       proc = row2proc[j];
1557       if (proc == rank) { /* a local row */
1558         rmap_loc[row - rstart] = j;
1559       } else {
1560         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1561       }
1562     }
1563 #else
1564     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1565     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1566 #endif
1567 
1568     /* Update lens from offproc data */
1569     /* recv a->j is done */
1570     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1571     for (i = 0; i < nrqs; i++) {
1572       proc    = pa[i];
1573       sbuf1_i = sbuf1[proc];
1574       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1575       ct1     = 2 + 1;
1576       ct2     = 0;
1577       rbuf2_i = rbuf2[i]; /* received length of C->j */
1578       rbuf3_i = rbuf3[i]; /* received C->j */
1579 
1580       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1581       max1 = sbuf1_i[2];
1582       for (k = 0; k < max1; k++, ct1++) {
1583 #if defined(PETSC_USE_CTABLE)
1584         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1585         row--;
1586         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1587 #else
1588         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1589 #endif
1590         /* Now, store row index of submat in sbuf1_i[ct1] */
1591         sbuf1_i[ct1] = row;
1592 
1593         nnz = rbuf2_i[ct1];
1594         if (!allcolumns) {
1595           for (l = 0; l < nnz; l++, ct2++) {
1596 #if defined(PETSC_USE_CTABLE)
1597             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1598               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1599             } else {
1600               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1601             }
1602 #else
1603             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1604 #endif
1605             if (tcol) lens[row]++;
1606           }
1607         } else { /* allcolumns */
1608           lens[row] += nnz;
1609         }
1610       }
1611     }
1612     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1613     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1614 
1615     /* Create the submatrices */
1616     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1617     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1618 
1619     PetscCall(ISGetBlockSize(isrow[0], &i));
1620     PetscCall(ISGetBlockSize(iscol[0], &j));
1621     PetscCall(MatSetBlockSizes(submat, i, j));
1622     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1623     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1624 
1625     /* create struct Mat_SubSppt and attach it to submat */
1626     PetscCall(PetscNew(&smatis1));
1627     subc            = (Mat_SeqAIJ *)submat->data;
1628     subc->submatis1 = smatis1;
1629 
1630     smatis1->id          = 0;
1631     smatis1->nrqs        = nrqs;
1632     smatis1->nrqr        = nrqr;
1633     smatis1->rbuf1       = rbuf1;
1634     smatis1->rbuf2       = rbuf2;
1635     smatis1->rbuf3       = rbuf3;
1636     smatis1->sbuf2       = sbuf2;
1637     smatis1->req_source2 = req_source2;
1638 
1639     smatis1->sbuf1 = sbuf1;
1640     smatis1->ptr   = ptr;
1641     smatis1->tmp   = tmp;
1642     smatis1->ctr   = ctr;
1643 
1644     smatis1->pa          = pa;
1645     smatis1->req_size    = req_size;
1646     smatis1->req_source1 = req_source1;
1647 
1648     smatis1->allcolumns = allcolumns;
1649     smatis1->singleis   = PETSC_TRUE;
1650     smatis1->row2proc   = row2proc;
1651     smatis1->rmap       = rmap;
1652     smatis1->cmap       = cmap;
1653 #if defined(PETSC_USE_CTABLE)
1654     smatis1->rmap_loc = rmap_loc;
1655     smatis1->cmap_loc = cmap_loc;
1656 #endif
1657 
1658     smatis1->destroy     = submat->ops->destroy;
1659     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1660     submat->factortype   = C->factortype;
1661 
1662     /* compute rmax */
1663     rmax = 0;
1664     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1665 
1666   } else { /* scall == MAT_REUSE_MATRIX */
1667     submat = submats[0];
1668     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1669 
1670     subc        = (Mat_SeqAIJ *)submat->data;
1671     rmax        = subc->rmax;
1672     smatis1     = subc->submatis1;
1673     nrqs        = smatis1->nrqs;
1674     nrqr        = smatis1->nrqr;
1675     rbuf1       = smatis1->rbuf1;
1676     rbuf2       = smatis1->rbuf2;
1677     rbuf3       = smatis1->rbuf3;
1678     req_source2 = smatis1->req_source2;
1679 
1680     sbuf1 = smatis1->sbuf1;
1681     sbuf2 = smatis1->sbuf2;
1682     ptr   = smatis1->ptr;
1683     tmp   = smatis1->tmp;
1684     ctr   = smatis1->ctr;
1685 
1686     pa          = smatis1->pa;
1687     req_size    = smatis1->req_size;
1688     req_source1 = smatis1->req_source1;
1689 
1690     allcolumns = smatis1->allcolumns;
1691     row2proc   = smatis1->row2proc;
1692     rmap       = smatis1->rmap;
1693     cmap       = smatis1->cmap;
1694 #if defined(PETSC_USE_CTABLE)
1695     rmap_loc = smatis1->rmap_loc;
1696     cmap_loc = smatis1->cmap_loc;
1697 #endif
1698   }
1699 
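  /* From here on the initial and reuse paths coincide: only the numerical values of C's diagonal
     and off-diagonal blocks (c->A and c->B) are exchanged, since the column indices and all
     communication metadata are either freshly built above or cached in Mat_SubSppt */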
1700   /* Post recv matrix values */
1701   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1702   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1703   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1704   for (i = 0; i < nrqs; ++i) {
1705     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1706     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1707   }
1708 
1709   /* Allocate sending buffers for a->a, and send them off */
1710   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1711   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1712   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1713   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1714 
1715   for (i = 0; i < nrqr; i++) {
1716     rbuf1_i   = rbuf1[i];
1717     sbuf_aa_i = sbuf_aa[i];
1718     ct1       = 2 * rbuf1_i[0] + 1;
1719     ct2       = 0;
1720     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1721 
1722     kmax = rbuf1_i[2];
1723     for (k = 0; k < kmax; k++, ct1++) {
1724       row    = rbuf1_i[ct1] - rstart;
1725       nzA    = ai[row + 1] - ai[row];
1726       nzB    = bi[row + 1] - bi[row];
1727       ncols  = nzA + nzB;
1728       cworkB = bj + bi[row];
1729       vworkA = a_a + ai[row];
1730       vworkB = b_a + bi[row];
1731 
1732       /* load the column values for this row into vals*/
1733       vals = sbuf_aa_i + ct2;
1734 
1735       lwrite = 0;
1736       for (l = 0; l < nzB; l++) {
1737         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1738       }
1739       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1740       for (l = 0; l < nzB; l++) {
1741         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1742       }
1743 
1744       ct2 += ncols;
1745     }
1746     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1747   }
1748 
1749   /* Assemble submat */
1750   /* First assemble the local rows */
1751   for (j = 0; j < nrow; j++) {
1752     row  = irow[j];
1753     proc = row2proc[j];
1754     if (proc == rank) {
1755       Crow = row - rstart; /* local row index of C */
1756 #if defined(PETSC_USE_CTABLE)
1757       row = rmap_loc[Crow]; /* row index of submat */
1758 #else
1759       row = rmap[row];
1760 #endif
1761 
1762       if (allcolumns) {
1763         /* diagonal part A = c->A */
1764         ncols = ai[Crow + 1] - ai[Crow];
1765         cols  = aj + ai[Crow];
1766         vals  = a_a + ai[Crow];
1767         i     = 0;
1768         for (k = 0; k < ncols; k++) {
1769           subcols[i]   = cols[k] + cstart;
1770           subvals[i++] = vals[k];
1771         }
1772 
1773         /* off-diagonal part B = c->B */
1774         ncols = bi[Crow + 1] - bi[Crow];
1775         cols  = bj + bi[Crow];
1776         vals  = b_a + bi[Crow];
1777         for (k = 0; k < ncols; k++) {
1778           subcols[i]   = bmap[cols[k]];
1779           subvals[i++] = vals[k];
1780         }
1781 
1782         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1783 
1784       } else { /* !allcolumns */
1785 #if defined(PETSC_USE_CTABLE)
1786         /* diagonal part A = c->A */
1787         ncols = ai[Crow + 1] - ai[Crow];
1788         cols  = aj + ai[Crow];
1789         vals  = a_a + ai[Crow];
1790         i     = 0;
1791         for (k = 0; k < ncols; k++) {
1792           tcol = cmap_loc[cols[k]];
1793           if (tcol) {
1794             subcols[i]   = --tcol;
1795             subvals[i++] = vals[k];
1796           }
1797         }
1798 
1799         /* off-diagonal part B = c->B */
1800         ncols = bi[Crow + 1] - bi[Crow];
1801         cols  = bj + bi[Crow];
1802         vals  = b_a + bi[Crow];
1803         for (k = 0; k < ncols; k++) {
1804           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1805           if (tcol) {
1806             subcols[i]   = --tcol;
1807             subvals[i++] = vals[k];
1808           }
1809         }
1810 #else
1811         /* diagonal part A = c->A */
1812         ncols = ai[Crow + 1] - ai[Crow];
1813         cols  = aj + ai[Crow];
1814         vals  = a_a + ai[Crow];
1815         i     = 0;
1816         for (k = 0; k < ncols; k++) {
1817           tcol = cmap[cols[k] + cstart];
1818           if (tcol) {
1819             subcols[i]   = --tcol;
1820             subvals[i++] = vals[k];
1821           }
1822         }
1823 
1824         /* off-diagonal part B = c->B */
1825         ncols = bi[Crow + 1] - bi[Crow];
1826         cols  = bj + bi[Crow];
1827         vals  = b_a + bi[Crow];
1828         for (k = 0; k < ncols; k++) {
1829           tcol = cmap[bmap[cols[k]]];
1830           if (tcol) {
1831             subcols[i]   = --tcol;
1832             subvals[i++] = vals[k];
1833           }
1834         }
1835 #endif
1836         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1837       }
1838     }
1839   }
1840 
1841   /* Now assemble the off-proc rows */
1842   for (i = 0; i < nrqs; i++) { /* for each requested message */
1843     /* recv values from other processes */
1844     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1845     proc    = pa[idex];
1846     sbuf1_i = sbuf1[proc];
1847     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1848     ct1     = 2 + 1;
1849     ct2     = 0;           /* count of received C->j */
1850     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1851     rbuf2_i = rbuf2[idex]; /* received row lengths of C->j from proc pa[idex] */
1852     rbuf3_i = rbuf3[idex]; /* received C->j from proc pa[idex] */
1853     rbuf4_i = rbuf4[idex]; /* received C->a from proc pa[idex] */
1854 
1855     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1856     max1 = sbuf1_i[2];                  /* num of rows */
1857     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1858       row = sbuf1_i[ct1];               /* row index of submat */
1859       if (!allcolumns) {
1860         idex = 0;
1861         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1862           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1863           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1864 #if defined(PETSC_USE_CTABLE)
1865             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1866               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1867             } else {
1868               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1869             }
1870 #else
1871             tcol = cmap[rbuf3_i[ct2]];
1872 #endif
1873             if (tcol) {
1874               subcols[idex]   = --tcol; /* may not be sorted */
1875               subvals[idex++] = rbuf4_i[ct2];
1876 
1877               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1878                For reuse, we replace the received C->j with the index that should be inserted into submat */
1879               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1880             }
1881           }
1882           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1883         } else { /* scall == MAT_REUSE_MATRIX */
1884           submat = submats[0];
1885           subc   = (Mat_SeqAIJ *)submat->data;
1886 
1887           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1888           for (l = 0; l < nnz; l++) {
1889             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1890             subvals[idex++] = rbuf4_i[ct2];
1891           }
1892 
1893           bj = subc->j + subc->i[row]; /* sorted column indices */
1894           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1895         }
1896       } else {              /* allcolumns */
1897         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1898         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1899         ct2 += nnz;
1900       }
1901     }
1902   }
1903 
1904   /* sends of a->a are done */
1905   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1906   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1907 
1908   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1909   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1910   submats[0] = submat;
1911 
1912   /* Restore the indices */
1913   PetscCall(ISRestoreIndices(isrow[0], &irow));
1914   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1915 
1916   /* Destroy allocated memory */
1917   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1918   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1919   if (sbuf_aa) {
1920     PetscCall(PetscFree(sbuf_aa[0]));
1921     PetscCall(PetscFree(sbuf_aa));
1922   }
1923 
1924   if (scall == MAT_INITIAL_MATRIX) {
1925     PetscCall(PetscFree(lens));
1926     if (sbuf_aj) {
1927       PetscCall(PetscFree(sbuf_aj[0]));
1928       PetscCall(PetscFree(sbuf_aj));
1929     }
1930   }
1931   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1932   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 
1936 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1937 {
1938   PetscInt  ncol;
1939   PetscBool colflag, allcolumns = PETSC_FALSE;
1940 
1941   PetscFunctionBegin;
1942   /* Allocate memory to hold all the submatrices */
1943   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1944 
1945   /* Check for special case: each processor gets entire matrix columns */
1946   PetscCall(ISIdentity(iscol[0], &colflag));
1947   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1948   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1949 
1950   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1951   PetscFunctionReturn(PETSC_SUCCESS);
1952 }
1953 
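/*
   MatCreateSubMatrices_MPIAIJ() below is the MATMPIAIJ implementation behind the public
   MatCreateSubMatrices() interface. A minimal usage sketch, assuming a parallel AIJ matrix C
   and taking each rank's owned row range as its (single) index set; the variable names are
   local to this example:

     IS       rowis, colis;
     Mat     *submats;
     PetscInt rstart, rend;

     PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &rowis));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &colis));
     PetscCall(MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submats));
     (use the sequential matrix submats[0] here)
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&rowis));
     PetscCall(ISDestroy(&colis));
*/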
1954 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1955 {
1956   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1957   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1958   Mat_SeqAIJ  *subc;
1959   Mat_SubSppt *smat;
1960 
1961   PetscFunctionBegin;
1962   /* Check for special case: each processor has a single IS */
1963   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1964     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1965     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1966     PetscFunctionReturn(PETSC_SUCCESS);
1967   }
1968 
1969   /* Collect global wantallmatrix and nstages */
1970   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1971   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1972   if (!nmax) nmax = 1;
1973 
1974   if (scall == MAT_INITIAL_MATRIX) {
1975     /* Collect global wantallmatrix and nstages */
1976     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1977       PetscCall(ISIdentity(*isrow, &rowflag));
1978       PetscCall(ISIdentity(*iscol, &colflag));
1979       PetscCall(ISGetLocalSize(*isrow, &nrow));
1980       PetscCall(ISGetLocalSize(*iscol, &ncol));
1981       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1982         wantallmatrix = PETSC_TRUE;
1983 
1984         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1985       }
1986     }
1987 
1988     /* Determine the number of stages through which submatrices are done
1989        Each stage will extract nmax submatrices.
1990        nmax is determined by the matrix column dimension.
1991        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1992     */
1993     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1994 
1995     in[0] = -1 * (PetscInt)wantallmatrix;
1996     in[1] = nstages;
1997     PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1998     wantallmatrix = (PetscBool)(-out[0]);
1999     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2000 
2001   } else { /* MAT_REUSE_MATRIX */
2002     if (ismax) {
2003       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2004       smat = subc->submatis1;
2005     } else { /* (*submat)[0] is a dummy matrix */
2006       smat = (Mat_SubSppt *)(*submat)[0]->data;
2007     }
2008     if (!smat) {
2009       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2010       wantallmatrix = PETSC_TRUE;
2011     } else if (smat->singleis) {
2012       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2013       PetscFunctionReturn(PETSC_SUCCESS);
2014     } else {
2015       nstages = smat->nstages;
2016     }
2017   }
2018 
2019   if (wantallmatrix) {
2020     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2021     PetscFunctionReturn(PETSC_SUCCESS);
2022   }
2023 
2024   /* Allocate memory to hold all the submatrices and dummy submatrices */
2025   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2026 
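  /* Extract the submatrices in nstages collective passes of at most nmax index sets each; a rank
     that has exhausted its local index sets still participates with max_no = 0 and (for
     MAT_INITIAL_MATRIX) receives a dummy matrix, so every rank calls
     MatCreateSubMatrices_MPIAIJ_Local() the same number of times. For example, with nmax = 2 a
     rank holding 5 index sets proceeds in batches of 2, 2 and 1 */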
2027   for (i = 0, pos = 0; i < nstages; i++) {
2028     if (pos + nmax <= ismax) max_no = nmax;
2029     else if (pos >= ismax) max_no = 0;
2030     else max_no = ismax - pos;
2031 
2032     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
2033     if (!max_no) {
2034       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2035         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2036         smat->nstages = nstages;
2037       }
2038       pos++; /* advance to next dummy matrix if any */
2039     } else pos += max_no;
2040   }
2041 
2042   if (ismax && scall == MAT_INITIAL_MATRIX) {
2043     /* save nstages for reuse */
2044     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2045     smat          = subc->submatis1;
2046     smat->nstages = nstages;
2047   }
2048   PetscFunctionReturn(PETSC_SUCCESS);
2049 }
2050 
2051 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2052 {
2053   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2054   Mat              A = c->A;
2055   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2056   const PetscInt **icol, **irow;
2057   PetscInt        *nrow, *ncol, start;
2058   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2059   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2060   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2061   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2062   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2063 #if defined(PETSC_USE_CTABLE)
2064   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2065 #else
2066   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2067 #endif
2068   const PetscInt *irow_i;
2069   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2070   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2071   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2072   MPI_Comm        comm;
2073   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2074   PetscMPIInt    *onodes1, *olengths1, end;
2075   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2076   Mat_SubSppt    *smat_i;
2077   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2078   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2079 
2080   PetscFunctionBegin;
2081   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2082   size = c->size;
2083   rank = c->rank;
2084 
2085   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2086   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2087 
2088   for (i = 0; i < ismax; i++) {
2089     PetscCall(ISSorted(iscol[i], &issorted[i]));
2090     if (!issorted[i]) iscsorted = issorted[i];
2091 
2092     PetscCall(ISSorted(isrow[i], &issorted[i]));
2093 
2094     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2095     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2096 
2097     /* Check for special case: allcolumn */
2098     PetscCall(ISIdentity(iscol[i], &colflag));
2099     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2100     if (colflag && ncol[i] == C->cmap->N) {
2101       allcolumns[i] = PETSC_TRUE;
2102       icol[i]       = NULL;
2103     } else {
2104       allcolumns[i] = PETSC_FALSE;
2105       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2106     }
2107   }
2108 
2109   if (scall == MAT_REUSE_MATRIX) {
2110     /* Assumes the new rows are the same length as the old rows */
2111     for (i = 0; i < ismax; i++) {
2112       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2113       subc = (Mat_SeqAIJ *)submats[i]->data;
2114       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2115 
2116       /* Initial matrix as if empty */
2117       /* Initialize the matrix as if empty */
2118 
2119       smat_i = subc->submatis1;
2120 
2121       nrqs        = smat_i->nrqs;
2122       nrqr        = smat_i->nrqr;
2123       rbuf1       = smat_i->rbuf1;
2124       rbuf2       = smat_i->rbuf2;
2125       rbuf3       = smat_i->rbuf3;
2126       req_source2 = smat_i->req_source2;
2127 
2128       sbuf1 = smat_i->sbuf1;
2129       sbuf2 = smat_i->sbuf2;
2130       ptr   = smat_i->ptr;
2131       tmp   = smat_i->tmp;
2132       ctr   = smat_i->ctr;
2133 
2134       pa          = smat_i->pa;
2135       req_size    = smat_i->req_size;
2136       req_source1 = smat_i->req_source1;
2137 
2138       allcolumns[i] = smat_i->allcolumns;
2139       row2proc[i]   = smat_i->row2proc;
2140       rmap[i]       = smat_i->rmap;
2141       cmap[i]       = smat_i->cmap;
2142     }
2143 
2144     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2145       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2146       smat_i = (Mat_SubSppt *)submats[0]->data;
2147 
2148       nrqs        = smat_i->nrqs;
2149       nrqr        = smat_i->nrqr;
2150       rbuf1       = smat_i->rbuf1;
2151       rbuf2       = smat_i->rbuf2;
2152       rbuf3       = smat_i->rbuf3;
2153       req_source2 = smat_i->req_source2;
2154 
2155       sbuf1 = smat_i->sbuf1;
2156       sbuf2 = smat_i->sbuf2;
2157       ptr   = smat_i->ptr;
2158       tmp   = smat_i->tmp;
2159       ctr   = smat_i->ctr;
2160 
2161       pa          = smat_i->pa;
2162       req_size    = smat_i->req_size;
2163       req_source1 = smat_i->req_source1;
2164 
2165       allcolumns[0] = PETSC_FALSE;
2166     }
2167   } else { /* scall == MAT_INITIAL_MATRIX */
2168     /* Get some new tags to keep the communication clean */
2169     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2170     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2171 
2172     /* evaluate communication - mesg to who, length of mesg, and buffer space
2173      required. Based on this, buffers are allocated, and data copied into them*/
2174     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2175 
2176     for (i = 0; i < ismax; i++) {
2177       jmax   = nrow[i];
2178       irow_i = irow[i];
2179 
2180       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2181       row2proc[i] = row2proc_i;
2182 
2183       if (issorted[i]) proc = 0;
2184       for (j = 0; j < jmax; j++) {
2185         if (!issorted[i]) proc = 0;
2186         row = irow_i[j];
2187         while (row >= C->rmap->range[proc + 1]) proc++;
2188         w4[proc]++;
2189         row2proc_i[j] = proc; /* map row index to proc */
2190       }
2191       for (j = 0; j < size; j++) {
2192         if (w4[j]) {
2193           w1[j] += w4[j];
2194           w3[j]++;
2195           w4[j] = 0;
2196         }
2197       }
2198     }
2199 
2200     nrqs     = 0; /* no of outgoing messages */
2201     msz      = 0; /* total mesg length (for all procs) */
2202     w1[rank] = 0; /* no mesg sent to self */
2203     w3[rank] = 0;
2204     for (i = 0; i < size; i++) {
2205       if (w1[i]) {
2206         w2[i] = 1;
2207         nrqs++;
2208       } /* there exists a message to proc i */
2209     }
2210     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2211     for (i = 0, j = 0; i < size; i++) {
2212       if (w1[i]) {
2213         pa[j] = i;
2214         j++;
2215       }
2216     }
2217 
2218     /* Each message would have a header = 1 + 2*(no of IS) + data */
2219     for (i = 0; i < nrqs; i++) {
2220       j = pa[i];
2221       w1[j] += w2[j] + 2 * w3[j];
2222       msz += w1[j];
2223     }
2224     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2225 
2226     /* Determine the number of messages to expect, their lengths, and their from-ids */
2227     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2228     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2229 
2230     /* Now post the Irecvs corresponding to these messages */
2231     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2232     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2233 
2234     /* Allocate Memory for outgoing messages */
2235     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2236     PetscCall(PetscArrayzero(sbuf1, size));
2237     PetscCall(PetscArrayzero(ptr, size));
2238 
2239     {
2240       PetscInt *iptr = tmp;
2241       k              = 0;
2242       for (i = 0; i < nrqs; i++) {
2243         j = pa[i];
2244         iptr += k;
2245         sbuf1[j] = iptr;
2246         k        = w1[j];
2247       }
2248     }
2249 
2250     /* Form the outgoing messages. Initialize the header space */
2251     for (i = 0; i < nrqs; i++) {
2252       j           = pa[i];
2253       sbuf1[j][0] = 0;
2254       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2255       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2256     }
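    /* Each outgoing message sbuf1[j] is laid out as
       [nIS | is_no_1, count_1, ..., is_no_nIS, count_nIS | rows of IS 1 | rows of IS 2 | ...]:
       slot 0 counts the index sets that request rows from proc j, each (is_no, count) pair takes
       two of the 2*w3[j] reserved header slots, and the requested global row indices follow,
       grouped by index set */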
2257 
2258     /* Parse the isrow and copy data into outbuf */
2259     for (i = 0; i < ismax; i++) {
2260       row2proc_i = row2proc[i];
2261       PetscCall(PetscArrayzero(ctr, size));
2262       irow_i = irow[i];
2263       jmax   = nrow[i];
2264       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2265         proc = row2proc_i[j];
2266         if (proc != rank) { /* copy to the outgoing buf*/
2267           ctr[proc]++;
2268           *ptr[proc] = irow_i[j];
2269           ptr[proc]++;
2270         }
2271       }
2272       /* Update the headers for the current IS */
2273       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2274         if ((ctr_j = ctr[j])) {
2275           sbuf1_j            = sbuf1[j];
2276           k                  = ++sbuf1_j[0];
2277           sbuf1_j[2 * k]     = ctr_j;
2278           sbuf1_j[2 * k - 1] = i;
2279         }
2280       }
2281     }
2282 
2283     /* Now post the sends */
2284     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2285     for (i = 0; i < nrqs; ++i) {
2286       j = pa[i];
2287       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2288     }
2289 
2290     /* Post Receives to capture the buffer size */
2291     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2292     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2293     if (nrqs) rbuf2[0] = tmp + msz;
2294     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2295     for (i = 0; i < nrqs; ++i) {
2296       j = pa[i];
2297       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2298     }
2299 
2300     /* Send to other procs the buf size they should allocate */
2301     /* Receive messages*/
2302     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2303     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2304     {
2305       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2306       PetscInt *sbuf2_i;
2307 
2308       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2309       for (i = 0; i < nrqr; ++i) {
2310         req_size[i] = 0;
2311         rbuf1_i     = rbuf1[i];
2312         start       = 2 * rbuf1_i[0] + 1;
2313         end         = olengths1[i];
2314         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2315         sbuf2_i = sbuf2[i];
2316         for (j = start; j < end; j++) {
2317           id         = rbuf1_i[j] - rstart;
2318           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2319           sbuf2_i[j] = ncols;
2320           req_size[i] += ncols;
2321         }
2322         req_source1[i] = onodes1[i];
2323         /* form the header */
2324         sbuf2_i[0] = req_size[i];
2325         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2326 
2327         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2328       }
2329     }
2330 
2331     PetscCall(PetscFree(onodes1));
2332     PetscCall(PetscFree(olengths1));
2333     PetscCall(PetscFree(r_waits1));
2334     PetscCall(PetscFree4(w1, w2, w3, w4));
2335 
2336     /* Receive messages*/
2337     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2338     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2339     for (i = 0; i < nrqs; ++i) {
2340       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2341       req_source2[i] = pa[i];
2342       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2343     }
2344     PetscCall(PetscFree(r_waits2));
2345 
2346     /* Wait on sends1 and sends2 */
2347     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2348     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2349     PetscCall(PetscFree(s_waits1));
2350     PetscCall(PetscFree(s_waits2));
2351 
2352     /* Now allocate sending buffers for a->j, and send them off */
2353     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2354     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2355     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2356     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2357 
2358     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2359     {
2360       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2361       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2362       PetscInt  cend = C->cmap->rend;
2363       PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2364 
2365       for (i = 0; i < nrqr; i++) {
2366         rbuf1_i   = rbuf1[i];
2367         sbuf_aj_i = sbuf_aj[i];
2368         ct1       = 2 * rbuf1_i[0] + 1;
2369         ct2       = 0;
2370         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2371           kmax = rbuf1[i][2 * j];
2372           for (k = 0; k < kmax; k++, ct1++) {
2373             row    = rbuf1_i[ct1] - rstart;
2374             nzA    = a_i[row + 1] - a_i[row];
2375             nzB    = b_i[row + 1] - b_i[row];
2376             ncols  = nzA + nzB;
2377             cworkA = a_j + a_i[row];
2378             cworkB = b_j + b_i[row];
2379 
2380             /* load the column indices for this row into cols */
2381             cols = sbuf_aj_i + ct2;
2382 
2383             lwrite = 0;
2384             for (l = 0; l < nzB; l++) {
2385               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2386             }
2387             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2388             for (l = 0; l < nzB; l++) {
2389               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2390             }
2391 
2392             ct2 += ncols;
2393           }
2394         }
2395         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2396       }
2397     }
2398 
2399     /* create col map: global col of C -> local col of submatrices */
2400     {
2401       const PetscInt *icol_i;
2402 #if defined(PETSC_USE_CTABLE)
2403       for (i = 0; i < ismax; i++) {
2404         if (!allcolumns[i]) {
2405           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2406 
2407           jmax   = ncol[i];
2408           icol_i = icol[i];
2409           cmap_i = cmap[i];
2410           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2411         } else cmap[i] = NULL;
2412       }
2413 #else
2414       for (i = 0; i < ismax; i++) {
2415         if (!allcolumns[i]) {
2416           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2417           jmax   = ncol[i];
2418           icol_i = icol[i];
2419           cmap_i = cmap[i];
2420           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2421         } else cmap[i] = NULL;
2422       }
2423 #endif
2424     }
2425 
2426     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2427     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2428     PetscCall(PetscMalloc1(ismax, &lens));
2429 
2430     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2431     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2432 
2433     /* Update lens from local data */
2434     for (i = 0; i < ismax; i++) {
2435       row2proc_i = row2proc[i];
2436       jmax       = nrow[i];
2437       if (!allcolumns[i]) cmap_i = cmap[i];
2438       irow_i = irow[i];
2439       lens_i = lens[i];
2440       for (j = 0; j < jmax; j++) {
2441         row  = irow_i[j];
2442         proc = row2proc_i[j];
2443         if (proc == rank) {
2444           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2445           if (!allcolumns[i]) {
2446             for (k = 0; k < ncols; k++) {
2447 #if defined(PETSC_USE_CTABLE)
2448               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2449 #else
2450               tcol = cmap_i[cols[k]];
2451 #endif
2452               if (tcol) lens_i[j]++;
2453             }
2454           } else { /* allcolumns */
2455             lens_i[j] = ncols;
2456           }
2457           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2458         }
2459       }
2460     }
2461 
2462     /* Create row map: global row of C -> local row of submatrices */
2463 #if defined(PETSC_USE_CTABLE)
2464     for (i = 0; i < ismax; i++) {
2465       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2466       irow_i = irow[i];
2467       jmax   = nrow[i];
2468       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2469     }
2470 #else
2471     for (i = 0; i < ismax; i++) {
2472       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2473       rmap_i = rmap[i];
2474       irow_i = irow[i];
2475       jmax   = nrow[i];
2476       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2477     }
2478 #endif
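    /* With PETSC_USE_CTABLE both rmap[i] and cmap[i] are keyed by (global index + 1) and store
       (local index + 1), so a lookup that returns 0 means "not present"; the dense-array variants
       are simply indexed by the global row/column index of C */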
2479 
2480     /* Update lens from offproc data */
2481     {
2482       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2483 
2484       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2485       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2486         sbuf1_i = sbuf1[pa[tmp2]];
2487         jmax    = sbuf1_i[0];
2488         ct1     = 2 * jmax + 1;
2489         ct2     = 0;
2490         rbuf2_i = rbuf2[tmp2];
2491         rbuf3_i = rbuf3[tmp2];
2492         for (j = 1; j <= jmax; j++) {
2493           is_no  = sbuf1_i[2 * j - 1];
2494           max1   = sbuf1_i[2 * j];
2495           lens_i = lens[is_no];
2496           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2497           rmap_i = rmap[is_no];
2498           for (k = 0; k < max1; k++, ct1++) {
2499 #if defined(PETSC_USE_CTABLE)
2500             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2501             row--;
2502             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2503 #else
2504             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2505 #endif
2506             max2 = rbuf2_i[ct1];
2507             for (l = 0; l < max2; l++, ct2++) {
2508               if (!allcolumns[is_no]) {
2509 #if defined(PETSC_USE_CTABLE)
2510                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2511 #else
2512                 tcol = cmap_i[rbuf3_i[ct2]];
2513 #endif
2514                 if (tcol) lens_i[row]++;
2515               } else {         /* allcolumns */
2516                 lens_i[row]++; /* lens_i[row] += max2 ? */
2517               }
2518             }
2519           }
2520         }
2521       }
2522     }
2523     PetscCall(PetscFree(r_waits3));
2524     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2525     PetscCall(PetscFree(s_waits3));
2526 
2527     /* Create the submatrices */
2528     for (i = 0; i < ismax; i++) {
2529       PetscInt rbs, cbs;
2530 
2531       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2532       PetscCall(ISGetBlockSize(iscol[i], &cbs));
2533 
2534       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2535       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2536 
2537       PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2538       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2539       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2540 
2541       /* create struct Mat_SubSppt and attach it to submat */
2542       PetscCall(PetscNew(&smat_i));
2543       subc            = (Mat_SeqAIJ *)submats[i]->data;
2544       subc->submatis1 = smat_i;
2545 
2546       smat_i->destroy          = submats[i]->ops->destroy;
2547       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2548       submats[i]->factortype   = C->factortype;
2549 
2550       smat_i->id          = i;
2551       smat_i->nrqs        = nrqs;
2552       smat_i->nrqr        = nrqr;
2553       smat_i->rbuf1       = rbuf1;
2554       smat_i->rbuf2       = rbuf2;
2555       smat_i->rbuf3       = rbuf3;
2556       smat_i->sbuf2       = sbuf2;
2557       smat_i->req_source2 = req_source2;
2558 
2559       smat_i->sbuf1 = sbuf1;
2560       smat_i->ptr   = ptr;
2561       smat_i->tmp   = tmp;
2562       smat_i->ctr   = ctr;
2563 
2564       smat_i->pa          = pa;
2565       smat_i->req_size    = req_size;
2566       smat_i->req_source1 = req_source1;
2567 
2568       smat_i->allcolumns = allcolumns[i];
2569       smat_i->singleis   = PETSC_FALSE;
2570       smat_i->row2proc   = row2proc[i];
2571       smat_i->rmap       = rmap[i];
2572       smat_i->cmap       = cmap[i];
2573     }
2574 
2575     if (!ismax) { /* Create a dummy submats[0] that carries struct Mat_SubSppt for reuse */
2576       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2577       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2578       PetscCall(MatSetType(submats[0], MATDUMMY));
2579 
2580       /* create struct Mat_SubSppt and attach it to submat */
2581       PetscCall(PetscNew(&smat_i));
2582       submats[0]->data = (void *)smat_i;
2583 
2584       smat_i->destroy          = submats[0]->ops->destroy;
2585       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2586       submats[0]->factortype   = C->factortype;
2587 
2588       smat_i->id          = 0;
2589       smat_i->nrqs        = nrqs;
2590       smat_i->nrqr        = nrqr;
2591       smat_i->rbuf1       = rbuf1;
2592       smat_i->rbuf2       = rbuf2;
2593       smat_i->rbuf3       = rbuf3;
2594       smat_i->sbuf2       = sbuf2;
2595       smat_i->req_source2 = req_source2;
2596 
2597       smat_i->sbuf1 = sbuf1;
2598       smat_i->ptr   = ptr;
2599       smat_i->tmp   = tmp;
2600       smat_i->ctr   = ctr;
2601 
2602       smat_i->pa          = pa;
2603       smat_i->req_size    = req_size;
2604       smat_i->req_source1 = req_source1;
2605 
2606       smat_i->allcolumns = PETSC_FALSE;
2607       smat_i->singleis   = PETSC_FALSE;
2608       smat_i->row2proc   = NULL;
2609       smat_i->rmap       = NULL;
2610       smat_i->cmap       = NULL;
2611     }
2612 
2613     if (ismax) PetscCall(PetscFree(lens[0]));
2614     PetscCall(PetscFree(lens));
2615     if (sbuf_aj) {
2616       PetscCall(PetscFree(sbuf_aj[0]));
2617       PetscCall(PetscFree(sbuf_aj));
2618     }
2619 
2620   } /* endof scall == MAT_INITIAL_MATRIX */
2621 
2622   /* Post recv matrix values */
2623   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2624   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2625   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2626   for (i = 0; i < nrqs; ++i) {
2627     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2628     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2629   }
2630 
2631   /* Allocate sending buffers for a->a, and send them off */
2632   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2633   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2634   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2635   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2636 
2637   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2638   {
2639     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2640     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2641     PetscInt     cend = C->cmap->rend;
2642     PetscInt    *b_j  = b->j;
2643     PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2644 
2645     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2646     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2647     for (i = 0; i < nrqr; i++) {
2648       rbuf1_i   = rbuf1[i];
2649       sbuf_aa_i = sbuf_aa[i];
2650       ct1       = 2 * rbuf1_i[0] + 1;
2651       ct2       = 0;
2652       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2653         kmax = rbuf1_i[2 * j];
2654         for (k = 0; k < kmax; k++, ct1++) {
2655           row    = rbuf1_i[ct1] - rstart;
2656           nzA    = a_i[row + 1] - a_i[row];
2657           nzB    = b_i[row + 1] - b_i[row];
2658           ncols  = nzA + nzB;
2659           cworkB = b_j + b_i[row];
2660           vworkA = a_a + a_i[row];
2661           vworkB = b_a + b_i[row];
2662 
2663           /* load the column values for this row into vals*/
2664           vals = sbuf_aa_i + ct2;
2665 
2666           lwrite = 0;
2667           for (l = 0; l < nzB; l++) {
2668             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2669           }
2670           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2671           for (l = 0; l < nzB; l++) {
2672             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2673           }
2674 
2675           ct2 += ncols;
2676         }
2677       }
2678       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2679     }
2680     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2681     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2682   }
2683 
2684   /* Assemble the matrices */
2685   /* First assemble the local rows */
2686   for (i = 0; i < ismax; i++) {
2687     row2proc_i = row2proc[i];
2688     subc       = (Mat_SeqAIJ *)submats[i]->data;
2689     imat_ilen  = subc->ilen;
2690     imat_j     = subc->j;
2691     imat_i     = subc->i;
2692     imat_a     = subc->a;
2693 
2694     if (!allcolumns[i]) cmap_i = cmap[i];
2695     rmap_i = rmap[i];
2696     irow_i = irow[i];
2697     jmax   = nrow[i];
2698     for (j = 0; j < jmax; j++) {
2699       row  = irow_i[j];
2700       proc = row2proc_i[j];
2701       if (proc == rank) {
2702         old_row = row;
2703 #if defined(PETSC_USE_CTABLE)
2704         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2705         row--;
2706 #else
2707         row = rmap_i[row];
2708 #endif
2709         ilen_row = imat_ilen[row];
2710         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2711         mat_i = imat_i[row];
2712         mat_a = imat_a + mat_i;
2713         mat_j = imat_j + mat_i;
2714         if (!allcolumns[i]) {
2715           for (k = 0; k < ncols; k++) {
2716 #if defined(PETSC_USE_CTABLE)
2717             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2718 #else
2719             tcol = cmap_i[cols[k]];
2720 #endif
2721             if (tcol) {
2722               *mat_j++ = tcol - 1;
2723               *mat_a++ = vals[k];
2724               ilen_row++;
2725             }
2726           }
2727         } else { /* allcolumns */
2728           for (k = 0; k < ncols; k++) {
2729             *mat_j++ = cols[k]; /* global col index! */
2730             *mat_a++ = vals[k];
2731             ilen_row++;
2732           }
2733         }
2734         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2735 
2736         imat_ilen[row] = ilen_row;
2737       }
2738     }
2739   }
2740 
2741   /* Now assemble the off-proc rows */
2742   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2743   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2744     sbuf1_i = sbuf1[pa[tmp2]];
2745     jmax    = sbuf1_i[0];
2746     ct1     = 2 * jmax + 1;
2747     ct2     = 0;
2748     rbuf2_i = rbuf2[tmp2];
2749     rbuf3_i = rbuf3[tmp2];
2750     rbuf4_i = rbuf4[tmp2];
2751     for (j = 1; j <= jmax; j++) {
2752       is_no  = sbuf1_i[2 * j - 1];
2753       rmap_i = rmap[is_no];
2754       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2755       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2756       imat_ilen = subc->ilen;
2757       imat_j    = subc->j;
2758       imat_i    = subc->i;
2759       imat_a    = subc->a;
2760       max1      = sbuf1_i[2 * j];
2761       for (k = 0; k < max1; k++, ct1++) {
2762         row = sbuf1_i[ct1];
2763 #if defined(PETSC_USE_CTABLE)
2764         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2765         row--;
2766 #else
2767         row = rmap_i[row];
2768 #endif
2769         ilen  = imat_ilen[row];
2770         mat_i = imat_i[row];
2771         mat_a = imat_a + mat_i;
2772         mat_j = imat_j + mat_i;
2773         max2  = rbuf2_i[ct1];
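        /* The reply mirrors the request offsets: rbuf2 carries the per-row column counts, rbuf3 the global column indices, and rbuf4 the values. */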
2774         if (!allcolumns[is_no]) {
2775           for (l = 0; l < max2; l++, ct2++) {
2776 #if defined(PETSC_USE_CTABLE)
2777             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2778 #else
2779             tcol = cmap_i[rbuf3_i[ct2]];
2780 #endif
2781             if (tcol) {
2782               *mat_j++ = tcol - 1;
2783               *mat_a++ = rbuf4_i[ct2];
2784               ilen++;
2785             }
2786           }
2787         } else { /* allcolumns */
2788           for (l = 0; l < max2; l++, ct2++) {
2789             *mat_j++ = rbuf3_i[ct2]; /* keep the same global column index as in C */
2790             *mat_a++ = rbuf4_i[ct2];
2791             ilen++;
2792           }
2793         }
2794         imat_ilen[row] = ilen;
2795       }
2796     }
2797   }
2798 
2799   if (!iscsorted) { /* sort column indices of the rows */
2800     for (i = 0; i < ismax; i++) {
2801       subc      = (Mat_SeqAIJ *)submats[i]->data;
2802       imat_j    = subc->j;
2803       imat_i    = subc->i;
2804       imat_a    = subc->a;
2805       imat_ilen = subc->ilen;
2806 
2807       if (allcolumns[i]) continue;
2808       jmax = nrow[i];
2809       for (j = 0; j < jmax; j++) {
2810         mat_i = imat_i[j];
2811         mat_a = imat_a + mat_i;
2812         mat_j = imat_j + mat_i;
2813         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2814       }
2815     }
2816   }
2817 
2818   PetscCall(PetscFree(r_waits4));
2819   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2820   PetscCall(PetscFree(s_waits4));
2821 
2822   /* Restore the indices */
2823   for (i = 0; i < ismax; i++) {
2824     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2825     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2826   }
2827 
2828   for (i = 0; i < ismax; i++) {
2829     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2830     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2831   }
2832 
2833   /* Destroy allocated memory */
2834   if (sbuf_aa) {
2835     PetscCall(PetscFree(sbuf_aa[0]));
2836     PetscCall(PetscFree(sbuf_aa));
2837   }
2838   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2839 
2840   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2841   PetscCall(PetscFree(rbuf4));
2842 
2843   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2844   PetscFunctionReturn(PETSC_SUCCESS);
2845 }
2846 
2847 /*
2848  Permute A & B into C's *local* index space using rowemb & dcolemb for A and rowemb & ocolemb for B.
2849  Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2850  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2851  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2852  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2853  state, and needs to be "assembled" later by compressing B's column space.
2854 
2855  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2856  Following this call, aij->A & aij->B have been created, even if empty.
2857  */
2858 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2859 {
2860   /* If making this function public, change the error returned in this function away from _PLIB. */
2861   Mat_MPIAIJ     *aij;
2862   Mat_SeqAIJ     *Baij;
2863   PetscBool       seqaij, Bdisassembled;
2864   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2865   PetscScalar     v;
2866   const PetscInt *rowindices, *colindices;
2867 
2868   PetscFunctionBegin;
2869   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2870   if (A) {
2871     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2872     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2873     if (rowemb) {
2874       PetscCall(ISGetLocalSize(rowemb, &m));
2875       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2876     } else {
2877       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2878     }
2879     if (dcolemb) {
2880       PetscCall(ISGetLocalSize(dcolemb, &n));
2881       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2882     } else {
2883       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2884     }
2885   }
2886   if (B) {
2887     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2888     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2889     if (rowemb) {
2890       PetscCall(ISGetLocalSize(rowemb, &m));
2891       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2892     } else {
2893       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2894     }
2895     if (ocolemb) {
2896       PetscCall(ISGetLocalSize(ocolemb, &n));
2897       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2898     } else {
2899       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2900     }
2901   }
2902 
2903   aij = (Mat_MPIAIJ *)C->data;
2904   if (!aij->A) {
2905     /* Mimic parts of MatMPIAIJSetPreallocation() */
2906     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2907     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2908     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2909     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2910   }
2911   if (A) {
2912     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2913   } else {
2914     PetscCall(MatSetUp(aij->A));
2915   }
2916   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2917     /*
2918       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2919       need to "disassemble" B -- convert it to using C's global indices.
2920       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2921 
2922       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2923       we call MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2924 
2925       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2926       At least avoid calling MatSetValues() and the implied searches?
2927     */
2928 
2929     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2930 #if defined(PETSC_USE_CTABLE)
2931       PetscCall(PetscHMapIDestroy(&aij->colmap));
2932 #else
2933       PetscCall(PetscFree(aij->colmap));
2934       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2935 #endif
2936       ngcol = 0;
2937       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2938       if (aij->garray) PetscCall(PetscFree(aij->garray));
2939       PetscCall(VecDestroy(&aij->lvec));
2940       PetscCall(VecScatterDestroy(&aij->Mvctx));
2941     }
2942     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2943     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2944   }
2945   Bdisassembled = PETSC_FALSE;
2946   if (!aij->B) {
2947     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2948     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2949     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2950     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2951     Bdisassembled = PETSC_TRUE;
2952   }
2953   if (B) {
2954     Baij = (Mat_SeqAIJ *)B->data;
2955     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2956       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2957       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2958       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2959       PetscCall(PetscFree(nz));
2960     }
2961 
2962     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2963     shift      = rend - rstart;
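    /* shift equals C's local row count; when aij->B was freshly created above in the disassembled (global column width) state, column indices at or beyond rstart are bumped by shift below. */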
2964     count      = 0;
2965     rowindices = NULL;
2966     colindices = NULL;
2967     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2968     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2969     for (i = 0; i < B->rmap->n; i++) {
2970       PetscInt row;
2971       row = i;
2972       if (rowindices) row = rowindices[i];
2973       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2974         col = Baij->j[count];
2975         if (colindices) col = colindices[col];
2976         if (Bdisassembled && col >= rstart) col += shift;
2977         v = Baij->a[count];
2978         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2979         ++count;
2980       }
2981     }
2982     /* No assembly for aij->B is necessary. */
2983     /* FIXME: set aij->B's nonzerostate correctly. */
2984   } else {
2985     PetscCall(MatSetUp(aij->B));
2986   }
2987   C->preallocated  = PETSC_TRUE;
2988   C->was_assembled = PETSC_FALSE;
2989   C->assembled     = PETSC_FALSE;
2990   /*
2991       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2992       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2993    */
2994   PetscFunctionReturn(PETSC_SUCCESS);
2995 }
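
/*
   Illustrative usage sketch (an assumption-laden sketch, not an excerpt from the library): Ad and Ao are
   MATSEQAIJ matrices whose local sizes satisfy the checks at the top of MatSetSeqMats_MPIAIJ(). Passing
   NULL embeddings means identity embeddings, so Ad must be C->rmap->n x C->cmap->n and Ao must be
   C->rmap->n x (C->cmap->N - C->cmap->n):

     PetscCall(MatSetSeqMats_MPIAIJ(C, NULL, NULL, NULL, DIFFERENT_NONZERO_PATTERN, Ad, Ao));
     PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); // compresses B's column space in MatSetUpMultiply_MPIAIJ()
     PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
*/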
2996 
2997 /*
2998   B uses local column indices in the range [0, N-n); they must be interpreted using garray.
2999  */
3000 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3001 {
3002   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3003 
3004   PetscFunctionBegin;
3005   PetscValidPointer(A, 2);
3006   PetscValidPointer(B, 3);
3007   /* FIXME: make sure C is assembled */
3008   *A = aij->A;
3009   *B = aij->B;
3010   /* Note that we don't incref *A and *B, so be careful! */
3011   PetscFunctionReturn(PETSC_SUCCESS);
3012 }
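
/*
   A minimal sketch (lcol and gcol are hypothetical names used only for illustration) of interpreting one of
   B's local column indices through garray, per the note above:

     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
     Mat         A, B;
     PetscCall(MatGetSeqMats_MPIAIJ(C, &A, &B));
     PetscInt lcol = 0;                    // a local column index of B, 0 <= lcol < B->cmap->n
     PetscInt gcol = aij->garray[lcol];    // the corresponding global column index of C
*/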
3013 
3014 /*
3015   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3016   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3017 */
3018 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3019 {
3020   PetscMPIInt size, flag;
3021   PetscInt    i, ii, cismax, ispar;
3022   Mat        *A, *B;
3023   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3024 
3025   PetscFunctionBegin;
3026   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3027 
3028   for (i = 0, cismax = 0; i < ismax; ++i) {
3029     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3030     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3031     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3032     if (size > 1) ++cismax;
3033   }
3034 
3035   /*
3036      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3037      ispar is the maximum per-rank count of parallel ISs, so it is nonzero iff some rank owns a parallel IS.
3038   */
3039   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3040   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3041     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3042     PetscFunctionReturn(PETSC_SUCCESS);
3043   }
3044 
3045   /* if (ispar) */
3046   /*
3047     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3048     These are used to extract the off-diag portion of the resulting parallel matrix.
3049     The row IS for the off-diag portion is the same as for the diag portion,
3050     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3051   */
3052   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3053   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3054   for (i = 0, ii = 0; i < ismax; ++i) {
3055     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3056     if (size > 1) {
3057       /*
3058          TODO: This is the part that's ***NOT SCALABLE***.
3059          To fix this we need to extract just the indices of C's nonzero columns
3060          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3061          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3062          to be done without serializing on the IS list, so, most likely, it is best
3063          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3064       */
3065       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3066       /* Now we have to
3067          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3068              were sorted on each rank, concatenated they might no longer be sorted;
3069          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3070              indices in the nondecreasing order to the original index positions.
3071          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3072       */
3073       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3074       PetscCall(ISSort(ciscol[ii]));
3075       ++ii;
3076     }
3077   }
3078   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3079   for (i = 0, ii = 0; i < ismax; ++i) {
3080     PetscInt        j, issize;
3081     const PetscInt *indices;
3082 
3083     /*
3084        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3085      */
3086     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3087     PetscCall(ISSort(isrow[i]));
3088     PetscCall(ISGetLocalSize(isrow[i], &issize));
3089     PetscCall(ISGetIndices(isrow[i], &indices));
3090     for (j = 1; j < issize; ++j) {
3091       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3092     }
3093     PetscCall(ISRestoreIndices(isrow[i], &indices));
3094     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3095     PetscCall(ISSort(iscol[i]));
3096     PetscCall(ISGetLocalSize(iscol[i], &issize));
3097     PetscCall(ISGetIndices(iscol[i], &indices));
3098     for (j = 1; j < issize; ++j) {
3099       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3100     }
3101     PetscCall(ISRestoreIndices(iscol[i], &indices));
3102     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3103     if (size > 1) {
3104       cisrow[ii] = isrow[i];
3105       ++ii;
3106     }
3107   }
3108   /*
3109     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3110     array of sequential matrices underlying the resulting parallel matrices.
3111     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3112     contain duplicates.
3113 
3114     There are as many diag matrices as there are original index sets. There are only as many parallel
3115     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3116 
3117     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3118     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3119       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3120       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3121       with A[i] and B[ii] extracted from the corresponding MPI submat.
3122     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3123       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3124       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3125       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3126       values into A[i] and B[ii] sitting inside the corresponding submat.
3127     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3128       A[i], B[ii], AA[i] or BB[ii] matrices.
3129   */
3130   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3131   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3132 
3133   /* Now obtain the sequential A and B submatrices separately. */
3134   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3135   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3136   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3137 
3138   /*
3139     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3140     matrices A & B have been extracted directly into the parallel matrices containing them, or
3141     simply into the sequential matrix identical with the corresponding A (if size == 1).
3142     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3143     to have the same sparsity pattern.
3144     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3145     must be constructed for C. This is done by setseqmat(s).
3146   */
3147   for (i = 0, ii = 0; i < ismax; ++i) {
3148     /*
3149        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3150        That way we can avoid sorting and computing permutations when reusing.
3151        To this end:
3152         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3153         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3154           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3155     */
3156     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3157 
3158     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3159     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3160     if (size > 1) {
3161       if (scall == MAT_INITIAL_MATRIX) {
3162         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3163         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3164         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3165         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3166         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3167       }
3168       /*
3169         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3170       */
3171       {
3172         Mat AA = A[i], BB = B[ii];
3173 
3174         if (AA || BB) {
3175           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3176           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3177           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3178         }
3179         PetscCall(MatDestroy(&AA));
3180       }
3181       PetscCall(ISDestroy(ciscol + ii));
3182       PetscCall(ISDestroy(ciscol_p + ii));
3183       ++ii;
3184     } else { /* if (size == 1) */
3185       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3186       if (isrow_p[i] || iscol_p[i]) {
3187         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3188         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3189         /* Otherwise A is extracted straight into (*submats)[i]. */
3190         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3191         PetscCall(MatDestroy(A + i));
3192       } else (*submat)[i] = A[i];
3193     }
3194     PetscCall(ISDestroy(&isrow_p[i]));
3195     PetscCall(ISDestroy(&iscol_p[i]));
3196   }
3197   PetscCall(PetscFree2(cisrow, ciscol));
3198   PetscCall(PetscFree2(isrow_p, iscol_p));
3199   PetscCall(PetscFree(ciscol_p));
3200   PetscCall(PetscFree(A));
3201   PetscCall(MatDestroySubMatrices(cismax, &B));
3202   PetscFunctionReturn(PETSC_SUCCESS);
3203 }
3204 
3205 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3206 {
3207   PetscFunctionBegin;
3208   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3209   PetscFunctionReturn(PETSC_SUCCESS);
3210 }
3211
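
/*
   A hedged usage sketch: callers typically reach the routine above through the public MatCreateSubMatricesMPI()
   interface; when the row/column ISs are parallel, the extracted submatrices come back as MATMPIAIJ. The names
   rowis, colis, and submats below are placeholders, not identifiers from this file:

     IS   rowis, colis;
     Mat *submats;
     PetscCall(MatCreateSubMatricesMPI(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submats));
     // ... use submats[0] ...
     PetscCall(MatDestroySubMatrices(1, &submats));
*/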