xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 89669be4d29968dc8d4c19ce1b69194a6a561ea4)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscInt       i;
24 
25   PetscFunctionBegin;
26   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27   for (i=0; i<ov; ++i) {
28     PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
29   }
30   PetscFunctionReturn(0);
31 }
32 
33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
34 {
35   PetscInt       i;
36 
37   PetscFunctionBegin;
38   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
39   for (i=0; i<ov; ++i) {
40     PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
41   }
42   PetscFunctionReturn(0);
43 }
44 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - Expands each index set in is[] by exactly one
  level of overlap, using sparse two-sided communication (PetscCommBuildTwoSided + PetscSF)
  so memory and message volume scale with the data actually exchanged rather than with the
  communicator size.

  Phases:
    1. Classify every index by owning rank; indices owned elsewhere become "remote rows".
    2. Ship <is id, row> pairs to the owners; owners compute the overlap of those rows
       (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and ship the column lists back.
    3. While the reply is in flight, do the purely local expansion
       (MatIncreaseOverlap_MPIAIJ_Local_Scalable), then merge the replies
       (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).

  Each is[i] is destroyed and recreated (sorted, duplicate-free) by the helpers.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    /* no rank references remote rows: a purely local expansion suffices */
    PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
    PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two entries (is id, row) per remote row */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j; /* offset into the sender's todata */
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
  PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  PetscCall(PetscFree2(todata,fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second exchange: reply sizes, then the packed overlap data itself */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  PetscCall(PetscCalloc1(nrecvrows,&todata));
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata,sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
187 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges overlap rows received from remote
  ranks into the local index sets.

  recvdata is a stream of nrecvs integers laid out as repeated triples
  <is id, count, count rows...>. Each is[i]'s current indices are copied into a private
  slot of size (max_lsize + nrecvs) so there is always room for every received row, the
  received rows are appended, duplicates are removed, and the IS is recreated on its
  original communicator (is[i] is destroyed and replaced).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one (max_lsize + nrecvs)-sized slot per IS: large enough for the old indices plus
     every received row in the worst case */
  PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    /* keep each IS's communicator alive so the rebuilt IS lives on the same one */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    PetscCall(ISGetIndices(is[i],&indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i],&indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i));
    PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}
235 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - Builds the reply data for remote overlap
  requests: for each requesting rank, collects the column indices of the locally-owned
  rows it asked about (from both the diagonal block amat and the off-diagonal block bmat,
  whose local columns are mapped to global indices through gcols), and packs them as
  repeated triples <is id, number of rows, rows...>.

  fromrows holds the incoming <is id, row> pairs grouped by source rank i, with
  fromsizes[2*i] entries per rank. Two passes are made over the same data: the first
  only counts unique columns (indv_counts) so one exact-size allocation suffices, the
  second actually packs sbdata.

  Output: *sbrowsizes gets per-rank <size, offset> pairs and *sbrows the packed data;
  both are allocated here and ownership passes to the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count unique columns per <rank, is> so we can allocate exactly once */
  for (i=0; i<nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, but now pack the data */
  for (i=0;i<nfrom;i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i]  += indvc_ij;
      PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs expected by the caller */
  PetscCall(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
357 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - Performs the purely local part of the one-level
  overlap expansion: each is[i] is replaced by the set of column indices of its locally-owned
  rows, drawn from both the diagonal block (amat, columns shifted by cstart) and the
  off-diagonal block (bmat, columns mapped to global indices through gcols).

  Rows owned by other ranks are skipped here; the send/receive helpers handle those.
  Each is[i] is destroyed and recreated sorted and duplicate-free on its own communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  /* tnz bounds the expansion of any one IS since each local row contributes
     at most its nonzeros */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i],&lsize));
    PetscCall(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue; /* remote rows are expanded by their owners */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   PetscCall(ISRestoreIndices(is[i],&indices));
   PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
   PetscCall(ISDestroy(&is[i]));
   PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
   PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
   PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
446   nrqr - no of requests received (which have to be or which have been processed)
447 */
/*
  MatIncreaseOverlap_MPIAIJ_Once - Expands each index set in is[] by one level of overlap
  using the classical (non-scalable) algorithm: explicit Isend/Irecv message exchange with
  per-rank work arrays of communicator size, plus either a bitmap + dense array (default)
  or a PetscTable (PETSC_USE_CTABLE) to accumulate each IS's membership.

  Outline:
    1. Count, per destination rank and per IS, how many indices are remote (w1/w3/w4),
       and build the list pa[] of ranks actually communicated with.
    2. Exchange message lengths, post Irecvs, pack outgoing buffers in the format
       described in the comment above, and Isend them.
    3. Do the local expansion (MatIncreaseOverlap_MPIAIJ_Local) while messages travel.
    4. Process received requests (MatIncreaseOverlap_MPIAIJ_Receive) and send replies back.
    5. Merge replies into the local tables and recreate each is[i] (destroyed here).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two tags: one for the request messages, one for the replies */
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));

  PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    PetscCall(ISGetIndices(is[i],&idx[i]));
    PetscCall(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  for (i=0; i<imax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j]: total indices destined to rank j; w3[j]: number of ISs touching rank j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(outdat,size));
  PetscCall(PetscArrayzero(ptr,size));

  {
    /* carve tmp into one contiguous outgoing buffer per destination rank */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      PetscCall(PetscTableCreate(n[i],M,&table_data[i]));
    }
    PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M,imax, &Mimax));
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      PetscCall(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    PetscCall(ISRestoreIndices(is[i],idx+i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  PetscCall(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    PetscCall(ISDestroy(&is[i]));
  }

  /* Do Local work while the request messages are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr,&recv_status));
  PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
  PetscCall(PetscFree4(w1,w2,w3,w4));

  /* compute the overlap of the rows other ranks requested of us */
  PetscCall(PetscMalloc1(nrqr,&xdata));
  PetscCall(PetscMalloc1(nrqr,&isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message: same header format as the outgoing requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* only append rows not already present */
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max+1,&tdata));
#endif

  /* rebuild each IS from the accumulated membership */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k; /* table stores key+1 -> position+1; undo both offsets */
    }
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    PetscCall(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table,data,isz,t_p));
#else
  PetscCall(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}
792 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once, to do
795        the work on the local processor.
796 
797      Inputs:
798       C      - MAT_MPIAIJ;
799       imax - total no of index sets processed at a time;
800       table  - an array of char - size = m bits.
801 
802      Output:
803       isz    - array containing the count of the solution elements corresponding
804                to each index set;
805       data or table_data  - pointer to the solutions
806 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;   /* A: diagonal block, B: off-diagonal block of the MPIAIJ matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;              /* bit table marking which global columns are already in index set i */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;        /* first global row owned by this process */
  cstart = C->cmap->rstart;        /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;              /* compressed B column index -> global column index */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    PetscCheck(tcount == isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    PetscCall(PetscMalloc1(tcount,&tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      /* the table stores (key+1,value+1); undo the shift and place the key at slot value */
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    /* latch the current set size: only entries present before this pass are expanded;
       entries appended below (isz_i++) are recorded but not swept in this call */
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;  /* local row index */
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;      /* global column of this diagonal-block entry */
        if (!PetscBTLookupSet(table_i,val)) { /* first occurrence of this column in set i */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];       /* global column of this off-diagonal-block entry */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;                /* updated count of solution elements for set i */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
894 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - number of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;           /* scratch bit table; reset for every IS processed */

  PetscFunctionBegin;
  m      = C->rmap->N;             /* global number of rows; sizes the bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total rows requested over all messages; total_sz = total request payload */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];           /* rbuf_i[0] = number of index sets in this message */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic: average nonzeros per local row times number of requested rows */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all reply messages share one contiguous buffer anchored at xdata[0];
     xdata[i] for i>0 are interior pointers into it */
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate,&xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m,&xtable));
  PetscCall(PetscArrayzero(isz1,nrqr));

  ct3 = 0;                         /* ct3 = total entries written across all replies */
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;          /* read cursor: skip the header of the request */
    ct2    =  ct1;                 /* write cursor into xdata[i] (reply header mirrors request header) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m,xtable));
      oct2 = ct2;                  /* remember where this IS's data begins in the reply */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          /* grow the shared buffer by 1.5x and re-fix the interior pointers */
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            PetscCall(PetscMalloc1(new_estimate,&tmp));
            PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly recorded row by the columns of its A and B parts */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
  PetscFunctionReturn(0);
}
1030 /* -------------------------------------------------------------------------*/
1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1032 /*
1033     Every processor gets the entire matrix
1034 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N,&lens));
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = nnz in diagonal block + nnz in off-diagonal block */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* each process fills its own slice of lens in place */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF,&B));
    PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B,A,A));
    PetscCall(MatSetType(B,((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B,0,lens));
    /* Bin is a length-2 array; only the first slot is filled here */
    PetscCall(PetscCalloc1(2,Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* pack directly into B's j array at the offset of this rank's first row */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nnz owned by process i (sum of its row lengths) */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered; reuse the existing B */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B,&bda));
    sendcount = ad->nz + bd->nz;
    /* pack values at the same offset used for the column indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* on the reuse path recvcounts/displs were not allocated above */
    if (!recvcounts) {
      PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda));
  }  /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts,displs));

  /* propagate symmetry information from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
  } else if (A->hermitian) {
    PetscCall(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE));
  } else if (A->structurally_symmetric) {
    PetscCall(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE));
  }
  PetscFunctionReturn(0);
}
1206 
1207 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1208 {
1209   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1210   Mat            submat,A = c->A,B = c->B;
1211   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1212   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1213   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1214   const PetscInt *icol,*irow;
1215   PetscInt       nrow,ncol,start;
1216   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1217   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1218   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1219   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1220   PetscInt       *lens,rmax,ncols,*cols,Crow;
1221 #if defined(PETSC_USE_CTABLE)
1222   PetscTable     cmap,rmap;
1223   PetscInt       *cmap_loc,*rmap_loc;
1224 #else
1225   PetscInt       *cmap,*rmap;
1226 #endif
1227   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1228   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1229   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1230   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1231   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1232   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1233   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1234   MPI_Comm       comm;
1235   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1236   PetscMPIInt    *onodes1,*olengths1,idex,end;
1237   Mat_SubSppt    *smatis1;
1238   PetscBool      isrowsorted,iscolsorted;
1239 
1240   PetscFunctionBegin;
1241   PetscValidLogicalCollectiveInt(C,ismax,2);
1242   PetscValidLogicalCollectiveEnum(C,scall,5);
1243   PetscCheck(ismax == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1244   PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
1245   PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a));
1246   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
1247   size = c->size;
1248   rank = c->rank;
1249 
1250   PetscCall(ISSorted(iscol[0],&iscolsorted));
1251   PetscCall(ISSorted(isrow[0],&isrowsorted));
1252   PetscCall(ISGetIndices(isrow[0],&irow));
1253   PetscCall(ISGetLocalSize(isrow[0],&nrow));
1254   if (allcolumns) {
1255     icol = NULL;
1256     ncol = C->cmap->N;
1257   } else {
1258     PetscCall(ISGetIndices(iscol[0],&icol));
1259     PetscCall(ISGetLocalSize(iscol[0],&ncol));
1260   }
1261 
1262   if (scall == MAT_INITIAL_MATRIX) {
1263     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1264 
1265     /* Get some new tags to keep the communication clean */
1266     tag1 = ((PetscObject)C)->tag;
1267     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
1268     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
1269 
1270     /* evaluate communication - mesg to who, length of mesg, and buffer space
1271      required. Based on this, buffers are allocated, and data copied into them */
1272     PetscCall(PetscCalloc2(size,&w1,size,&w2));
1273     PetscCall(PetscMalloc1(nrow,&row2proc));
1274 
1275     /* w1[proc] = num of rows owned by proc -- to be requested */
1276     proc = 0;
1277     nrqs = 0; /* num of outgoing messages */
1278     for (j=0; j<nrow; j++) {
1279       row  = irow[j];
1280       if (!isrowsorted) proc = 0;
1281       while (row >= C->rmap->range[proc+1]) proc++;
1282       w1[proc]++;
1283       row2proc[j] = proc; /* map row index to proc */
1284 
1285       if (proc != rank && !w2[proc]) {
1286         w2[proc] = 1; nrqs++;
1287       }
1288     }
1289     w1[rank] = 0;  /* rows owned by self will not be requested */
1290 
1291     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
1292     for (proc=0,j=0; proc<size; proc++) {
1293       if (w1[proc]) { pa[j++] = proc;}
1294     }
1295 
1296     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1297     msz = 0;              /* total mesg length (for all procs) */
1298     for (i=0; i<nrqs; i++) {
1299       proc      = pa[i];
1300       w1[proc] += 3;
1301       msz      += w1[proc];
1302     }
1303     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
1304 
1305     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1306     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1307     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
1308 
1309     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1310        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1311     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
1312 
1313     /* Now post the Irecvs corresponding to these messages */
1314     PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
1315 
1316     PetscCall(PetscFree(onodes1));
1317     PetscCall(PetscFree(olengths1));
1318 
1319     /* Allocate Memory for outgoing messages */
1320     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
1321     PetscCall(PetscArrayzero(sbuf1,size));
1322     PetscCall(PetscArrayzero(ptr,size));
1323 
1324     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1325     iptr = tmp;
1326     for (i=0; i<nrqs; i++) {
1327       proc        = pa[i];
1328       sbuf1[proc] = iptr;
1329       iptr       += w1[proc];
1330     }
1331 
1332     /* Form the outgoing messages */
1333     /* Initialize the header space */
1334     for (i=0; i<nrqs; i++) {
1335       proc      = pa[i];
1336       PetscCall(PetscArrayzero(sbuf1[proc],3));
1337       ptr[proc] = sbuf1[proc] + 3;
1338     }
1339 
1340     /* Parse the isrow and copy data into outbuf */
1341     PetscCall(PetscArrayzero(ctr,size));
1342     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1343       proc = row2proc[j];
1344       if (proc != rank) { /* copy to the outgoing buf*/
1345         *ptr[proc] = irow[j];
1346         ctr[proc]++; ptr[proc]++;
1347       }
1348     }
1349 
1350     /* Update the headers for the current IS */
1351     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1352       if ((ctr_j = ctr[j])) {
1353         sbuf1_j        = sbuf1[j];
1354         k              = ++sbuf1_j[0];
1355         sbuf1_j[2*k]   = ctr_j;
1356         sbuf1_j[2*k-1] = 0;
1357       }
1358     }
1359 
1360     /* Now post the sends */
1361     PetscCall(PetscMalloc1(nrqs,&s_waits1));
1362     for (i=0; i<nrqs; ++i) {
1363       proc = pa[i];
1364       PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i));
1365     }
1366 
1367     /* Post Receives to capture the buffer size */
1368     PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2));
1369     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
1370 
1371     if (nrqs) rbuf2[0] = tmp + msz;
1372     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1373 
1374     for (i=0; i<nrqs; ++i) {
1375       proc = pa[i];
1376       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i));
1377     }
1378 
1379     PetscCall(PetscFree2(w1,w2));
1380 
1381     /* Send to other procs the buf size they should allocate */
1382     /* Receive messages*/
1383     PetscCall(PetscMalloc1(nrqr,&r_status1));
1384     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
1385 
1386     PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1));
1387     for (i=0; i<nrqr; ++i) {
1388       req_size[i] = 0;
1389       rbuf1_i        = rbuf1[i];
1390       start          = 2*rbuf1_i[0] + 1;
1391       PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
1392       PetscCall(PetscMalloc1(end,&sbuf2[i]));
1393       sbuf2_i        = sbuf2[i];
1394       for (j=start; j<end; j++) {
1395         k            = rbuf1_i[j] - rstart;
1396         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1397         sbuf2_i[j]   = ncols;
1398         req_size[i] += ncols;
1399       }
1400       req_source1[i] = r_status1[i].MPI_SOURCE;
1401 
1402       /* form the header */
1403       sbuf2_i[0] = req_size[i];
1404       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1405 
1406       PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
1407     }
1408 
1409     PetscCall(PetscFree(r_status1));
1410     PetscCall(PetscFree(r_waits1));
1411 
1412     /* rbuf2 is received, Post recv column indices a->j */
1413     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2));
1414 
1415     PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3));
1416     for (i=0; i<nrqs; ++i) {
1417       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
1418       req_source2[i] = r_status2[i].MPI_SOURCE;
1419       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
1420     }
1421 
1422     /* Wait on sends1 and sends2 */
1423     PetscCall(PetscMalloc1(nrqs,&s_status1));
1424     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
1425     PetscCall(PetscFree(s_waits1));
1426     PetscCall(PetscFree(s_status1));
1427 
1428     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
1429     PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2));
1430 
1431     /* Now allocate sending buffers for a->j, and send them off */
1432     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
1433     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1434     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
1435     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1436 
1437     for (i=0; i<nrqr; i++) { /* for each requested message */
1438       rbuf1_i   = rbuf1[i];
1439       sbuf_aj_i = sbuf_aj[i];
1440       ct1       = 2*rbuf1_i[0] + 1;
1441       ct2       = 0;
1442       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1443 
1444       kmax = rbuf1[i][2];
1445       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1446         row    = rbuf1_i[ct1] - rstart;
1447         nzA    = ai[row+1] - ai[row];
1448         nzB    = bi[row+1] - bi[row];
1449         ncols  = nzA + nzB;
1450         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1451 
1452         /* load the column indices for this row into cols*/
1453         cols = sbuf_aj_i + ct2;
1454 
1455         lwrite = 0;
1456         for (l=0; l<nzB; l++) {
1457           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1458         }
1459         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1460         for (l=0; l<nzB; l++) {
1461           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1462         }
1463 
1464         ct2 += ncols;
1465       }
1466       PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
1467     }
1468 
1469     /* create column map (cmap): global col of C -> local col of submat */
1470 #if defined(PETSC_USE_CTABLE)
1471     if (!allcolumns) {
1472       PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap));
1473       PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc));
1474       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1475         if (icol[j] >= cstart && icol[j] <cend) {
1476           cmap_loc[icol[j] - cstart] = j+1;
1477         } else { /* use PetscTable for non-local col indices */
1478           PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES));
1479         }
1480       }
1481     } else {
1482       cmap     = NULL;
1483       cmap_loc = NULL;
1484     }
1485     PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc));
1486 #else
1487     if (!allcolumns) {
1488       PetscCall(PetscCalloc1(C->cmap->N,&cmap));
1489       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1490     } else {
1491       cmap = NULL;
1492     }
1493 #endif
1494 
1495     /* Create lens for MatSeqAIJSetPreallocation() */
1496     PetscCall(PetscCalloc1(nrow,&lens));
1497 
1498     /* Compute lens from local part of C */
1499     for (j=0; j<nrow; j++) {
1500       row  = irow[j];
1501       proc = row2proc[j];
1502       if (proc == rank) {
1503         /* diagonal part A = c->A */
1504         ncols = ai[row-rstart+1] - ai[row-rstart];
1505         cols  = aj + ai[row-rstart];
1506         if (!allcolumns) {
1507           for (k=0; k<ncols; k++) {
1508 #if defined(PETSC_USE_CTABLE)
1509             tcol = cmap_loc[cols[k]];
1510 #else
1511             tcol = cmap[cols[k]+cstart];
1512 #endif
1513             if (tcol) lens[j]++;
1514           }
1515         } else { /* allcolumns */
1516           lens[j] = ncols;
1517         }
1518 
1519         /* off-diagonal part B = c->B */
1520         ncols = bi[row-rstart+1] - bi[row-rstart];
1521         cols  = bj + bi[row-rstart];
1522         if (!allcolumns) {
1523           for (k=0; k<ncols; k++) {
1524 #if defined(PETSC_USE_CTABLE)
1525             PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1526 #else
1527             tcol = cmap[bmap[cols[k]]];
1528 #endif
1529             if (tcol) lens[j]++;
1530           }
1531         } else { /* allcolumns */
1532           lens[j] += ncols;
1533         }
1534       }
1535     }
1536 
1537     /* Create row map (rmap): global row of C -> local row of submat */
1538 #if defined(PETSC_USE_CTABLE)
1539     PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap));
1540     for (j=0; j<nrow; j++) {
1541       row  = irow[j];
1542       proc = row2proc[j];
1543       if (proc == rank) { /* a local row */
1544         rmap_loc[row - rstart] = j;
1545       } else {
1546         PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES));
1547       }
1548     }
1549 #else
1550     PetscCall(PetscCalloc1(C->rmap->N,&rmap));
1551     for (j=0; j<nrow; j++) {
1552       rmap[irow[j]] = j;
1553     }
1554 #endif
1555 
1556     /* Update lens from offproc data */
1557     /* recv a->j is done */
1558     PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3));
1559     for (i=0; i<nrqs; i++) {
1560       proc    = pa[i];
1561       sbuf1_i = sbuf1[proc];
1562       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1563       ct1     = 2 + 1;
1564       ct2     = 0;
1565       rbuf2_i = rbuf2[i]; /* received length of C->j */
1566       rbuf3_i = rbuf3[i]; /* received C->j */
1567 
1568       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1569       max1   = sbuf1_i[2];
1570       for (k=0; k<max1; k++,ct1++) {
1571 #if defined(PETSC_USE_CTABLE)
1572         PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row));
1573         row--;
1574         PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1575 #else
1576         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1577 #endif
1578         /* Now, store row index of submat in sbuf1_i[ct1] */
1579         sbuf1_i[ct1] = row;
1580 
1581         nnz = rbuf2_i[ct1];
1582         if (!allcolumns) {
1583           for (l=0; l<nnz; l++,ct2++) {
1584 #if defined(PETSC_USE_CTABLE)
1585             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1586               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1587             } else {
1588               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1589             }
1590 #else
1591             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1592 #endif
1593             if (tcol) lens[row]++;
1594           }
1595         } else { /* allcolumns */
1596           lens[row] += nnz;
1597         }
1598       }
1599     }
1600     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3));
1601     PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3));
1602 
1603     /* Create the submatrices */
1604     PetscCall(MatCreate(PETSC_COMM_SELF,&submat));
1605     PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE));
1606 
1607     PetscCall(ISGetBlockSize(isrow[0],&i));
1608     PetscCall(ISGetBlockSize(iscol[0],&j));
1609     PetscCall(MatSetBlockSizes(submat,i,j));
1610     PetscCall(MatSetType(submat,((PetscObject)A)->type_name));
1611     PetscCall(MatSeqAIJSetPreallocation(submat,0,lens));
1612 
1613     /* create struct Mat_SubSppt and attached it to submat */
1614     PetscCall(PetscNew(&smatis1));
1615     subc = (Mat_SeqAIJ*)submat->data;
1616     subc->submatis1 = smatis1;
1617 
1618     smatis1->id          = 0;
1619     smatis1->nrqs        = nrqs;
1620     smatis1->nrqr        = nrqr;
1621     smatis1->rbuf1       = rbuf1;
1622     smatis1->rbuf2       = rbuf2;
1623     smatis1->rbuf3       = rbuf3;
1624     smatis1->sbuf2       = sbuf2;
1625     smatis1->req_source2 = req_source2;
1626 
1627     smatis1->sbuf1       = sbuf1;
1628     smatis1->ptr         = ptr;
1629     smatis1->tmp         = tmp;
1630     smatis1->ctr         = ctr;
1631 
1632     smatis1->pa           = pa;
1633     smatis1->req_size     = req_size;
1634     smatis1->req_source1  = req_source1;
1635 
1636     smatis1->allcolumns  = allcolumns;
1637     smatis1->singleis    = PETSC_TRUE;
1638     smatis1->row2proc    = row2proc;
1639     smatis1->rmap        = rmap;
1640     smatis1->cmap        = cmap;
1641 #if defined(PETSC_USE_CTABLE)
1642     smatis1->rmap_loc    = rmap_loc;
1643     smatis1->cmap_loc    = cmap_loc;
1644 #endif
1645 
1646     smatis1->destroy     = submat->ops->destroy;
1647     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1648     submat->factortype   = C->factortype;
1649 
1650     /* compute rmax */
1651     rmax = 0;
1652     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1653 
1654   } else { /* scall == MAT_REUSE_MATRIX */
1655     submat = submats[0];
1656     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1657 
1658     subc    = (Mat_SeqAIJ*)submat->data;
1659     rmax    = subc->rmax;
1660     smatis1 = subc->submatis1;
1661     nrqs        = smatis1->nrqs;
1662     nrqr        = smatis1->nrqr;
1663     rbuf1       = smatis1->rbuf1;
1664     rbuf2       = smatis1->rbuf2;
1665     rbuf3       = smatis1->rbuf3;
1666     req_source2 = smatis1->req_source2;
1667 
1668     sbuf1     = smatis1->sbuf1;
1669     sbuf2     = smatis1->sbuf2;
1670     ptr       = smatis1->ptr;
1671     tmp       = smatis1->tmp;
1672     ctr       = smatis1->ctr;
1673 
1674     pa         = smatis1->pa;
1675     req_size   = smatis1->req_size;
1676     req_source1 = smatis1->req_source1;
1677 
1678     allcolumns = smatis1->allcolumns;
1679     row2proc   = smatis1->row2proc;
1680     rmap       = smatis1->rmap;
1681     cmap       = smatis1->cmap;
1682 #if defined(PETSC_USE_CTABLE)
1683     rmap_loc   = smatis1->rmap_loc;
1684     cmap_loc   = smatis1->cmap_loc;
1685 #endif
1686   }
1687 
1688   /* Post recv matrix values */
1689   PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals));
1690   PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4));
1691   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
1692   for (i=0; i<nrqs; ++i) {
1693     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
1694     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
1695   }
1696 
1697   /* Allocate sending buffers for a->a, and send them off */
1698   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
1699   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1700   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
1701   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1702 
1703   for (i=0; i<nrqr; i++) {
1704     rbuf1_i   = rbuf1[i];
1705     sbuf_aa_i = sbuf_aa[i];
1706     ct1       = 2*rbuf1_i[0]+1;
1707     ct2       = 0;
1708     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1709 
1710     kmax = rbuf1_i[2];
1711     for (k=0; k<kmax; k++,ct1++) {
1712       row = rbuf1_i[ct1] - rstart;
1713       nzA = ai[row+1] - ai[row];
1714       nzB = bi[row+1] - bi[row];
1715       ncols  = nzA + nzB;
1716       cworkB = bj + bi[row];
1717       vworkA = a_a + ai[row];
1718       vworkB = b_a + bi[row];
1719 
1720       /* load the column values for this row into vals*/
1721       vals = sbuf_aa_i + ct2;
1722 
1723       lwrite = 0;
1724       for (l=0; l<nzB; l++) {
1725         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1726       }
1727       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1728       for (l=0; l<nzB; l++) {
1729         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1730       }
1731 
1732       ct2 += ncols;
1733     }
1734     PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
1735   }
1736 
1737   /* Assemble submat */
1738   /* First assemble the local rows */
1739   for (j=0; j<nrow; j++) {
1740     row  = irow[j];
1741     proc = row2proc[j];
1742     if (proc == rank) {
1743       Crow = row - rstart;  /* local row index of C */
1744 #if defined(PETSC_USE_CTABLE)
1745       row = rmap_loc[Crow]; /* row index of submat */
1746 #else
1747       row = rmap[row];
1748 #endif
1749 
1750       if (allcolumns) {
1751         /* diagonal part A = c->A */
1752         ncols = ai[Crow+1] - ai[Crow];
1753         cols  = aj + ai[Crow];
1754         vals  = a_a + ai[Crow];
1755         i     = 0;
1756         for (k=0; k<ncols; k++) {
1757           subcols[i]   = cols[k] + cstart;
1758           subvals[i++] = vals[k];
1759         }
1760 
1761         /* off-diagonal part B = c->B */
1762         ncols = bi[Crow+1] - bi[Crow];
1763         cols  = bj + bi[Crow];
1764         vals  = b_a + bi[Crow];
1765         for (k=0; k<ncols; k++) {
1766           subcols[i]   = bmap[cols[k]];
1767           subvals[i++] = vals[k];
1768         }
1769 
1770         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1771 
1772       } else { /* !allcolumns */
1773 #if defined(PETSC_USE_CTABLE)
1774         /* diagonal part A = c->A */
1775         ncols = ai[Crow+1] - ai[Crow];
1776         cols  = aj + ai[Crow];
1777         vals  = a_a + ai[Crow];
1778         i     = 0;
1779         for (k=0; k<ncols; k++) {
1780           tcol = cmap_loc[cols[k]];
1781           if (tcol) {
1782             subcols[i]   = --tcol;
1783             subvals[i++] = vals[k];
1784           }
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow+1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k=0; k<ncols; k++) {
1792           PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1793           if (tcol) {
1794             subcols[i]   = --tcol;
1795             subvals[i++] = vals[k];
1796           }
1797         }
1798 #else
1799         /* diagonal part A = c->A */
1800         ncols = ai[Crow+1] - ai[Crow];
1801         cols  = aj + ai[Crow];
1802         vals  = a_a + ai[Crow];
1803         i     = 0;
1804         for (k=0; k<ncols; k++) {
1805           tcol = cmap[cols[k]+cstart];
1806           if (tcol) {
1807             subcols[i]   = --tcol;
1808             subvals[i++] = vals[k];
1809           }
1810         }
1811 
1812         /* off-diagonal part B = c->B */
1813         ncols = bi[Crow+1] - bi[Crow];
1814         cols  = bj + bi[Crow];
1815         vals  = b_a + bi[Crow];
1816         for (k=0; k<ncols; k++) {
1817           tcol = cmap[bmap[cols[k]]];
1818           if (tcol) {
1819             subcols[i]   = --tcol;
1820             subvals[i++] = vals[k];
1821           }
1822         }
1823 #endif
1824         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1825       }
1826     }
1827   }
1828 
1829   /* Now assemble the off-proc rows */
1830   for (i=0; i<nrqs; i++) { /* for each requested message */
1831     /* recv values from other processes */
1832     PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i));
1833     proc    = pa[idex];
1834     sbuf1_i = sbuf1[proc];
1835     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1836     ct1     = 2 + 1;
1837     ct2     = 0; /* count of received C->j */
1838     ct3     = 0; /* count of received C->j that will be inserted into submat */
1839     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1840     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1841     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1842 
1843     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1844     max1 = sbuf1_i[2];             /* num of rows */
1845     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1846       row = sbuf1_i[ct1]; /* row index of submat */
1847       if (!allcolumns) {
1848         idex = 0;
1849         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1850           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1851           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1852 #if defined(PETSC_USE_CTABLE)
1853             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1854               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1855             } else {
1856               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1857             }
1858 #else
1859             tcol = cmap[rbuf3_i[ct2]];
1860 #endif
1861             if (tcol) {
1862               subcols[idex]   = --tcol; /* may not be sorted */
1863               subvals[idex++] = rbuf4_i[ct2];
1864 
1865               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1866                For reuse, we replace received C->j with index that should be inserted to submat */
1867               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1868             }
1869           }
1870           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES));
1871         } else { /* scall == MAT_REUSE_MATRIX */
1872           submat = submats[0];
1873           subc   = (Mat_SeqAIJ*)submat->data;
1874 
1875           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1876           for (l=0; l<nnz; l++) {
1877             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1878             subvals[idex++] = rbuf4_i[ct2];
1879           }
1880 
1881           bj = subc->j + subc->i[row]; /* sorted column indices */
1882           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES));
1883         }
1884       } else { /* allcolumns */
1885         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1886         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES));
1887         ct2 += nnz;
1888       }
1889     }
1890   }
1891 
1892   /* sending a->a are done */
1893   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4));
1894   PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4));
1895 
1896   PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY));
1897   PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY));
1898   submats[0] = submat;
1899 
1900   /* Restore the indices */
1901   PetscCall(ISRestoreIndices(isrow[0],&irow));
1902   if (!allcolumns) {
1903     PetscCall(ISRestoreIndices(iscol[0],&icol));
1904   }
1905 
1906   /* Destroy allocated memory */
1907   for (i=0; i<nrqs; ++i) {
1908     PetscCall(PetscFree(rbuf4[i]));
1909   }
1910   PetscCall(PetscFree3(rbuf4,subcols,subvals));
1911   if (sbuf_aa) {
1912     PetscCall(PetscFree(sbuf_aa[0]));
1913     PetscCall(PetscFree(sbuf_aa));
1914   }
1915 
1916   if (scall == MAT_INITIAL_MATRIX) {
1917     PetscCall(PetscFree(lens));
1918     if (sbuf_aj) {
1919       PetscCall(PetscFree(sbuf_aj[0]));
1920       PetscCall(PetscFree(sbuf_aj));
1921     }
1922   }
1923   PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
1924   PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
1925   PetscFunctionReturn(0);
1926 }
1927 
1928 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1929 {
1930   PetscInt       ncol;
1931   PetscBool      colflag,allcolumns=PETSC_FALSE;
1932 
1933   PetscFunctionBegin;
1934   /* Allocate memory to hold all the submatrices */
1935   if (scall == MAT_INITIAL_MATRIX) {
1936     PetscCall(PetscCalloc1(2,submat));
1937   }
1938 
1939   /* Check for special case: each processor gets entire matrix columns */
1940   PetscCall(ISIdentity(iscol[0],&colflag));
1941   PetscCall(ISGetLocalSize(iscol[0],&ncol));
1942   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1943 
1944   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
1945   PetscFunctionReturn(0);
1946 }
1947 
1948 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1949 {
1950   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1951   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1952   Mat_SeqAIJ     *subc;
1953   Mat_SubSppt    *smat;
1954 
1955   PetscFunctionBegin;
1956   /* Check for special case: each processor has a single IS */
1957   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1958     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
1959     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1960     PetscFunctionReturn(0);
1961   }
1962 
1963   /* Collect global wantallmatrix and nstages */
1964   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1965   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1966   if (!nmax) nmax = 1;
1967 
1968   if (scall == MAT_INITIAL_MATRIX) {
1969     /* Collect global wantallmatrix and nstages */
1970     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1971       PetscCall(ISIdentity(*isrow,&rowflag));
1972       PetscCall(ISIdentity(*iscol,&colflag));
1973       PetscCall(ISGetLocalSize(*isrow,&nrow));
1974       PetscCall(ISGetLocalSize(*iscol,&ncol));
1975       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1976         wantallmatrix = PETSC_TRUE;
1977 
1978         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
1979       }
1980     }
1981 
1982     /* Determine the number of stages through which submatrices are done
1983        Each stage will extract nmax submatrices.
1984        nmax is determined by the matrix column dimension.
1985        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1986     */
1987     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1988 
1989     in[0] = -1*(PetscInt)wantallmatrix;
1990     in[1] = nstages;
1991     PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
1992     wantallmatrix = (PetscBool)(-out[0]);
1993     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
1994 
1995   } else { /* MAT_REUSE_MATRIX */
1996     if (ismax) {
1997       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
1998       smat = subc->submatis1;
1999     } else { /* (*submat)[0] is a dummy matrix */
2000       smat = (Mat_SubSppt*)(*submat)[0]->data;
2001     }
2002     if (!smat) {
2003       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2004       wantallmatrix = PETSC_TRUE;
2005     } else if (smat->singleis) {
2006       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
2007       PetscFunctionReturn(0);
2008     } else {
2009       nstages = smat->nstages;
2010     }
2011   }
2012 
2013   if (wantallmatrix) {
2014     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
2015     PetscFunctionReturn(0);
2016   }
2017 
2018   /* Allocate memory to hold all the submatrices and dummy submatrices */
2019   if (scall == MAT_INITIAL_MATRIX) {
2020     PetscCall(PetscCalloc1(ismax+nstages,submat));
2021   }
2022 
2023   for (i=0,pos=0; i<nstages; i++) {
2024     if (pos+nmax <= ismax) max_no = nmax;
2025     else if (pos >= ismax) max_no = 0;
2026     else                   max_no = ismax-pos;
2027 
2028     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
2029     if (!max_no) {
2030       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2031         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2032         smat->nstages = nstages;
2033       }
2034       pos++; /* advance to next dummy matrix if any */
2035     } else pos += max_no;
2036   }
2037 
2038   if (ismax && scall == MAT_INITIAL_MATRIX) {
2039     /* save nstages for reuse */
2040     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2041     smat = subc->submatis1;
2042     smat->nstages = nstages;
2043   }
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 /* -------------------------------------------------------------------------*/
2048 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2049 {
2050   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2051   Mat            A  = c->A;
2052   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2053   const PetscInt **icol,**irow;
2054   PetscInt       *nrow,*ncol,start;
2055   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2056   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2057   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2058   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2059   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2060 #if defined(PETSC_USE_CTABLE)
2061   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2062 #else
2063   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2064 #endif
2065   const PetscInt *irow_i;
2066   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2067   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2068   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2069   MPI_Comm       comm;
2070   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2071   PetscMPIInt    *onodes1,*olengths1,end;
2072   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2073   Mat_SubSppt    *smat_i;
2074   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2075   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2076 
2077   PetscFunctionBegin;
2078   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
2079   size = c->size;
2080   rank = c->rank;
2081 
2082   PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
2083   PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));
2084 
2085   for (i=0; i<ismax; i++) {
2086     PetscCall(ISSorted(iscol[i],&issorted[i]));
2087     if (!issorted[i]) iscsorted = issorted[i];
2088 
2089     PetscCall(ISSorted(isrow[i],&issorted[i]));
2090 
2091     PetscCall(ISGetIndices(isrow[i],&irow[i]));
2092     PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));
2093 
2094     /* Check for special case: allcolumn */
2095     PetscCall(ISIdentity(iscol[i],&colflag));
2096     PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
2097     if (colflag && ncol[i] == C->cmap->N) {
2098       allcolumns[i] = PETSC_TRUE;
2099       icol[i] = NULL;
2100     } else {
2101       allcolumns[i] = PETSC_FALSE;
2102       PetscCall(ISGetIndices(iscol[i],&icol[i]));
2103     }
2104   }
2105 
2106   if (scall == MAT_REUSE_MATRIX) {
2107     /* Assumes new rows are same length as the old rows */
2108     for (i=0; i<ismax; i++) {
2109       PetscCheck(submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
2110       subc = (Mat_SeqAIJ*)submats[i]->data;
2111       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2112 
2113       /* Initial matrix as if empty */
2114       PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n));
2115 
2116       smat_i   = subc->submatis1;
2117 
2118       nrqs        = smat_i->nrqs;
2119       nrqr        = smat_i->nrqr;
2120       rbuf1       = smat_i->rbuf1;
2121       rbuf2       = smat_i->rbuf2;
2122       rbuf3       = smat_i->rbuf3;
2123       req_source2 = smat_i->req_source2;
2124 
2125       sbuf1     = smat_i->sbuf1;
2126       sbuf2     = smat_i->sbuf2;
2127       ptr       = smat_i->ptr;
2128       tmp       = smat_i->tmp;
2129       ctr       = smat_i->ctr;
2130 
2131       pa          = smat_i->pa;
2132       req_size    = smat_i->req_size;
2133       req_source1 = smat_i->req_source1;
2134 
2135       allcolumns[i] = smat_i->allcolumns;
2136       row2proc[i]   = smat_i->row2proc;
2137       rmap[i]       = smat_i->rmap;
2138       cmap[i]       = smat_i->cmap;
2139     }
2140 
2141     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2142       PetscCheck(submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2143       smat_i = (Mat_SubSppt*)submats[0]->data;
2144 
2145       nrqs        = smat_i->nrqs;
2146       nrqr        = smat_i->nrqr;
2147       rbuf1       = smat_i->rbuf1;
2148       rbuf2       = smat_i->rbuf2;
2149       rbuf3       = smat_i->rbuf3;
2150       req_source2 = smat_i->req_source2;
2151 
2152       sbuf1       = smat_i->sbuf1;
2153       sbuf2       = smat_i->sbuf2;
2154       ptr         = smat_i->ptr;
2155       tmp         = smat_i->tmp;
2156       ctr         = smat_i->ctr;
2157 
2158       pa          = smat_i->pa;
2159       req_size    = smat_i->req_size;
2160       req_source1 = smat_i->req_source1;
2161 
2162       allcolumns[0] = PETSC_FALSE;
2163     }
2164   } else { /* scall == MAT_INITIAL_MATRIX */
2165     /* Get some new tags to keep the communication clean */
2166     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
2167     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
2168 
2169     /* evaluate communication - mesg to who, length of mesg, and buffer space
2170      required. Based on this, buffers are allocated, and data copied into them*/
2171     PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */
2172 
2173     for (i=0; i<ismax; i++) {
2174       jmax   = nrow[i];
2175       irow_i = irow[i];
2176 
2177       PetscCall(PetscMalloc1(jmax,&row2proc_i));
2178       row2proc[i] = row2proc_i;
2179 
2180       if (issorted[i]) proc = 0;
2181       for (j=0; j<jmax; j++) {
2182         if (!issorted[i]) proc = 0;
2183         row = irow_i[j];
2184         while (row >= C->rmap->range[proc+1]) proc++;
2185         w4[proc]++;
2186         row2proc_i[j] = proc; /* map row index to proc */
2187       }
2188       for (j=0; j<size; j++) {
2189         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2190       }
2191     }
2192 
2193     nrqs     = 0;              /* no of outgoing messages */
2194     msz      = 0;              /* total mesg length (for all procs) */
2195     w1[rank] = 0;              /* no mesg sent to self */
2196     w3[rank] = 0;
2197     for (i=0; i<size; i++) {
2198       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2199     }
2200     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
2201     for (i=0,j=0; i<size; i++) {
2202       if (w1[i]) { pa[j] = i; j++; }
2203     }
2204 
2205     /* Each message would have a header = 1 + 2*(no of IS) + data */
2206     for (i=0; i<nrqs; i++) {
2207       j      = pa[i];
2208       w1[j] += w2[j] + 2* w3[j];
2209       msz   += w1[j];
2210     }
2211     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
2212 
2213     /* Determine the number of messages to expect, their lengths, from from-ids */
2214     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
2215     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
2216 
2217     /* Now post the Irecvs corresponding to these messages */
2218     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
2219     PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
2220 
2221     /* Allocate Memory for outgoing messages */
2222     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
2223     PetscCall(PetscArrayzero(sbuf1,size));
2224     PetscCall(PetscArrayzero(ptr,size));
2225 
2226     {
2227       PetscInt *iptr = tmp;
2228       k    = 0;
2229       for (i=0; i<nrqs; i++) {
2230         j        = pa[i];
2231         iptr    += k;
2232         sbuf1[j] = iptr;
2233         k        = w1[j];
2234       }
2235     }
2236 
2237     /* Form the outgoing messages. Initialize the header space */
2238     for (i=0; i<nrqs; i++) {
2239       j           = pa[i];
2240       sbuf1[j][0] = 0;
2241       PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
2242       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2243     }
2244 
2245     /* Parse the isrow and copy data into outbuf */
2246     for (i=0; i<ismax; i++) {
2247       row2proc_i = row2proc[i];
2248       PetscCall(PetscArrayzero(ctr,size));
2249       irow_i = irow[i];
2250       jmax   = nrow[i];
2251       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2252         proc = row2proc_i[j];
2253         if (proc != rank) { /* copy to the outgoing buf*/
2254           ctr[proc]++;
2255           *ptr[proc] = irow_i[j];
2256           ptr[proc]++;
2257         }
2258       }
2259       /* Update the headers for the current IS */
2260       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2261         if ((ctr_j = ctr[j])) {
2262           sbuf1_j        = sbuf1[j];
2263           k              = ++sbuf1_j[0];
2264           sbuf1_j[2*k]   = ctr_j;
2265           sbuf1_j[2*k-1] = i;
2266         }
2267       }
2268     }
2269 
2270     /*  Now  post the sends */
2271     PetscCall(PetscMalloc1(nrqs,&s_waits1));
2272     for (i=0; i<nrqs; ++i) {
2273       j    = pa[i];
2274       PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
2275     }
2276 
2277     /* Post Receives to capture the buffer size */
2278     PetscCall(PetscMalloc1(nrqs,&r_waits2));
2279     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
2280     if (nrqs) rbuf2[0] = tmp + msz;
2281     for (i=1; i<nrqs; ++i) {
2282       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2283     }
2284     for (i=0; i<nrqs; ++i) {
2285       j    = pa[i];
2286       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
2287     }
2288 
2289     /* Send to other procs the buf size they should allocate */
2290     /* Receive messages*/
2291     PetscCall(PetscMalloc1(nrqr,&s_waits2));
2292     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
2293     {
2294       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2295       PetscInt   *sbuf2_i;
2296 
2297       PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
2298       for (i=0; i<nrqr; ++i) {
2299         req_size[i] = 0;
2300         rbuf1_i        = rbuf1[i];
2301         start          = 2*rbuf1_i[0] + 1;
2302         end            = olengths1[i];
2303         PetscCall(PetscMalloc1(end,&sbuf2[i]));
2304         sbuf2_i        = sbuf2[i];
2305         for (j=start; j<end; j++) {
2306           id              = rbuf1_i[j] - rstart;
2307           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2308           sbuf2_i[j]      = ncols;
2309           req_size[i] += ncols;
2310         }
2311         req_source1[i] = onodes1[i];
2312         /* form the header */
2313         sbuf2_i[0] = req_size[i];
2314         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2315 
2316         PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
2317       }
2318     }
2319 
2320     PetscCall(PetscFree(onodes1));
2321     PetscCall(PetscFree(olengths1));
2322     PetscCall(PetscFree(r_waits1));
2323     PetscCall(PetscFree4(w1,w2,w3,w4));
2324 
2325     /* Receive messages*/
2326     PetscCall(PetscMalloc1(nrqs,&r_waits3));
2327     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
2328     for (i=0; i<nrqs; ++i) {
2329       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
2330       req_source2[i] = pa[i];
2331       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
2332     }
2333     PetscCall(PetscFree(r_waits2));
2334 
2335     /* Wait on sends1 and sends2 */
2336     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
2337     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
2338     PetscCall(PetscFree(s_waits1));
2339     PetscCall(PetscFree(s_waits2));
2340 
2341     /* Now allocate sending buffers for a->j, and send them off */
2342     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
2343     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2344     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
2345     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2346 
2347     PetscCall(PetscMalloc1(nrqr,&s_waits3));
2348     {
2349       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2350       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2351       PetscInt cend = C->cmap->rend;
2352       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2353 
2354       for (i=0; i<nrqr; i++) {
2355         rbuf1_i   = rbuf1[i];
2356         sbuf_aj_i = sbuf_aj[i];
2357         ct1       = 2*rbuf1_i[0] + 1;
2358         ct2       = 0;
2359         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2360           kmax = rbuf1[i][2*j];
2361           for (k=0; k<kmax; k++,ct1++) {
2362             row    = rbuf1_i[ct1] - rstart;
2363             nzA    = a_i[row+1] - a_i[row];
2364             nzB    = b_i[row+1] - b_i[row];
2365             ncols  = nzA + nzB;
2366             cworkA = a_j + a_i[row];
2367             cworkB = b_j + b_i[row];
2368 
2369             /* load the column indices for this row into cols */
2370             cols = sbuf_aj_i + ct2;
2371 
2372             lwrite = 0;
2373             for (l=0; l<nzB; l++) {
2374               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2375             }
2376             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2377             for (l=0; l<nzB; l++) {
2378               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2379             }
2380 
2381             ct2 += ncols;
2382           }
2383         }
2384         PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
2385       }
2386     }
2387 
2388     /* create col map: global col of C -> local col of submatrices */
2389     {
2390       const PetscInt *icol_i;
2391 #if defined(PETSC_USE_CTABLE)
2392       for (i=0; i<ismax; i++) {
2393         if (!allcolumns[i]) {
2394           PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));
2395 
2396           jmax   = ncol[i];
2397           icol_i = icol[i];
2398           cmap_i = cmap[i];
2399           for (j=0; j<jmax; j++) {
2400             PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
2401           }
2402         } else cmap[i] = NULL;
2403       }
2404 #else
2405       for (i=0; i<ismax; i++) {
2406         if (!allcolumns[i]) {
2407           PetscCall(PetscCalloc1(C->cmap->N,&cmap[i]));
2408           jmax   = ncol[i];
2409           icol_i = icol[i];
2410           cmap_i = cmap[i];
2411           for (j=0; j<jmax; j++) {
2412             cmap_i[icol_i[j]] = j+1;
2413           }
2414         } else cmap[i] = NULL;
2415       }
2416 #endif
2417     }
2418 
2419     /* Create lens which is required for MatCreate... */
2420     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2421     PetscCall(PetscMalloc1(ismax,&lens));
2422 
2423     if (ismax) {
2424       PetscCall(PetscCalloc1(j,&lens[0]));
2425     }
2426     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2427 
2428     /* Update lens from local data */
2429     for (i=0; i<ismax; i++) {
2430       row2proc_i = row2proc[i];
2431       jmax = nrow[i];
2432       if (!allcolumns[i]) cmap_i = cmap[i];
2433       irow_i = irow[i];
2434       lens_i = lens[i];
2435       for (j=0; j<jmax; j++) {
2436         row = irow_i[j];
2437         proc = row2proc_i[j];
2438         if (proc == rank) {
2439           PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2440           if (!allcolumns[i]) {
2441             for (k=0; k<ncols; k++) {
2442 #if defined(PETSC_USE_CTABLE)
2443               PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2444 #else
2445               tcol = cmap_i[cols[k]];
2446 #endif
2447               if (tcol) lens_i[j]++;
2448             }
2449           } else { /* allcolumns */
2450             lens_i[j] = ncols;
2451           }
2452           PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2453         }
2454       }
2455     }
2456 
2457     /* Create row map: global row of C -> local row of submatrices */
2458 #if defined(PETSC_USE_CTABLE)
2459     for (i=0; i<ismax; i++) {
2460       PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
2461       irow_i = irow[i];
2462       jmax   = nrow[i];
2463       for (j=0; j<jmax; j++) {
2464       PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
2465       }
2466     }
2467 #else
2468     for (i=0; i<ismax; i++) {
2469       PetscCall(PetscCalloc1(C->rmap->N,&rmap[i]));
2470       rmap_i = rmap[i];
2471       irow_i = irow[i];
2472       jmax   = nrow[i];
2473       for (j=0; j<jmax; j++) {
2474         rmap_i[irow_i[j]] = j;
2475       }
2476     }
2477 #endif
2478 
2479     /* Update lens from offproc data */
2480     {
2481       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2482 
2483       PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
2484       for (tmp2=0; tmp2<nrqs; tmp2++) {
2485         sbuf1_i = sbuf1[pa[tmp2]];
2486         jmax    = sbuf1_i[0];
2487         ct1     = 2*jmax+1;
2488         ct2     = 0;
2489         rbuf2_i = rbuf2[tmp2];
2490         rbuf3_i = rbuf3[tmp2];
2491         for (j=1; j<=jmax; j++) {
2492           is_no  = sbuf1_i[2*j-1];
2493           max1   = sbuf1_i[2*j];
2494           lens_i = lens[is_no];
2495           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2496           rmap_i = rmap[is_no];
2497           for (k=0; k<max1; k++,ct1++) {
2498 #if defined(PETSC_USE_CTABLE)
2499             PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
2500             row--;
2501             PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2502 #else
2503             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2504 #endif
2505             max2 = rbuf2_i[ct1];
2506             for (l=0; l<max2; l++,ct2++) {
2507               if (!allcolumns[is_no]) {
2508 #if defined(PETSC_USE_CTABLE)
2509                 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2510 #else
2511                 tcol = cmap_i[rbuf3_i[ct2]];
2512 #endif
2513                 if (tcol) lens_i[row]++;
2514               } else { /* allcolumns */
2515                 lens_i[row]++; /* lens_i[row] += max2 ? */
2516               }
2517             }
2518           }
2519         }
2520       }
2521     }
2522     PetscCall(PetscFree(r_waits3));
2523     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
2524     PetscCall(PetscFree(s_waits3));
2525 
2526     /* Create the submatrices */
2527     for (i=0; i<ismax; i++) {
2528       PetscInt    rbs,cbs;
2529 
2530       PetscCall(ISGetBlockSize(isrow[i],&rbs));
2531       PetscCall(ISGetBlockSize(iscol[i],&cbs));
2532 
2533       PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
2534       PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));
2535 
2536       PetscCall(MatSetBlockSizes(submats[i],rbs,cbs));
2537       PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
2538       PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));
2539 
2540       /* create struct Mat_SubSppt and attached it to submat */
2541       PetscCall(PetscNew(&smat_i));
2542       subc = (Mat_SeqAIJ*)submats[i]->data;
2543       subc->submatis1 = smat_i;
2544 
2545       smat_i->destroy          = submats[i]->ops->destroy;
2546       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2547       submats[i]->factortype   = C->factortype;
2548 
2549       smat_i->id          = i;
2550       smat_i->nrqs        = nrqs;
2551       smat_i->nrqr        = nrqr;
2552       smat_i->rbuf1       = rbuf1;
2553       smat_i->rbuf2       = rbuf2;
2554       smat_i->rbuf3       = rbuf3;
2555       smat_i->sbuf2       = sbuf2;
2556       smat_i->req_source2 = req_source2;
2557 
2558       smat_i->sbuf1       = sbuf1;
2559       smat_i->ptr         = ptr;
2560       smat_i->tmp         = tmp;
2561       smat_i->ctr         = ctr;
2562 
2563       smat_i->pa           = pa;
2564       smat_i->req_size     = req_size;
2565       smat_i->req_source1  = req_source1;
2566 
2567       smat_i->allcolumns  = allcolumns[i];
2568       smat_i->singleis    = PETSC_FALSE;
2569       smat_i->row2proc    = row2proc[i];
2570       smat_i->rmap        = rmap[i];
2571       smat_i->cmap        = cmap[i];
2572     }
2573 
2574     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2575       PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
2576       PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
2577       PetscCall(MatSetType(submats[0],MATDUMMY));
2578 
2579       /* create struct Mat_SubSppt and attached it to submat */
2580       PetscCall(PetscNewLog(submats[0],&smat_i));
2581       submats[0]->data = (void*)smat_i;
2582 
2583       smat_i->destroy          = submats[0]->ops->destroy;
2584       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2585       submats[0]->factortype   = C->factortype;
2586 
2587       smat_i->id          = 0;
2588       smat_i->nrqs        = nrqs;
2589       smat_i->nrqr        = nrqr;
2590       smat_i->rbuf1       = rbuf1;
2591       smat_i->rbuf2       = rbuf2;
2592       smat_i->rbuf3       = rbuf3;
2593       smat_i->sbuf2       = sbuf2;
2594       smat_i->req_source2 = req_source2;
2595 
2596       smat_i->sbuf1       = sbuf1;
2597       smat_i->ptr         = ptr;
2598       smat_i->tmp         = tmp;
2599       smat_i->ctr         = ctr;
2600 
2601       smat_i->pa           = pa;
2602       smat_i->req_size     = req_size;
2603       smat_i->req_source1  = req_source1;
2604 
2605       smat_i->allcolumns  = PETSC_FALSE;
2606       smat_i->singleis    = PETSC_FALSE;
2607       smat_i->row2proc    = NULL;
2608       smat_i->rmap        = NULL;
2609       smat_i->cmap        = NULL;
2610     }
2611 
2612     if (ismax) PetscCall(PetscFree(lens[0]));
2613     PetscCall(PetscFree(lens));
2614     if (sbuf_aj) {
2615       PetscCall(PetscFree(sbuf_aj[0]));
2616       PetscCall(PetscFree(sbuf_aj));
2617     }
2618 
2619   } /* endof scall == MAT_INITIAL_MATRIX */
2620 
2621   /* Post recv matrix values */
2622   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
2623   PetscCall(PetscMalloc1(nrqs,&rbuf4));
2624   PetscCall(PetscMalloc1(nrqs,&r_waits4));
2625   for (i=0; i<nrqs; ++i) {
2626     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
2627     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
2628   }
2629 
2630   /* Allocate sending buffers for a->a, and send them off */
2631   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
2632   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2633   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
2634   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2635 
2636   PetscCall(PetscMalloc1(nrqr,&s_waits4));
2637   {
2638     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2639     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2640     PetscInt    cend   = C->cmap->rend;
2641     PetscInt    *b_j   = b->j;
2642     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2643 
2644     PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
2645     PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
2646     for (i=0; i<nrqr; i++) {
2647       rbuf1_i   = rbuf1[i];
2648       sbuf_aa_i = sbuf_aa[i];
2649       ct1       = 2*rbuf1_i[0]+1;
2650       ct2       = 0;
2651       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2652         kmax = rbuf1_i[2*j];
2653         for (k=0; k<kmax; k++,ct1++) {
2654           row    = rbuf1_i[ct1] - rstart;
2655           nzA    = a_i[row+1] - a_i[row];
2656           nzB    = b_i[row+1] - b_i[row];
2657           ncols  = nzA + nzB;
2658           cworkB = b_j + b_i[row];
2659           vworkA = a_a + a_i[row];
2660           vworkB = b_a + b_i[row];
2661 
2662           /* load the column values for this row into vals*/
2663           vals = sbuf_aa_i+ct2;
2664 
2665           lwrite = 0;
2666           for (l=0; l<nzB; l++) {
2667             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2668           }
2669           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2670           for (l=0; l<nzB; l++) {
2671             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2672           }
2673 
2674           ct2 += ncols;
2675         }
2676       }
2677       PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
2678     }
2679     PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
2680     PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
2681   }
2682 
2683   /* Assemble the matrices */
2684   /* First assemble the local rows */
2685   for (i=0; i<ismax; i++) {
2686     row2proc_i = row2proc[i];
2687     subc      = (Mat_SeqAIJ*)submats[i]->data;
2688     imat_ilen = subc->ilen;
2689     imat_j    = subc->j;
2690     imat_i    = subc->i;
2691     imat_a    = subc->a;
2692 
2693     if (!allcolumns[i]) cmap_i = cmap[i];
2694     rmap_i = rmap[i];
2695     irow_i = irow[i];
2696     jmax   = nrow[i];
2697     for (j=0; j<jmax; j++) {
2698       row  = irow_i[j];
2699       proc = row2proc_i[j];
2700       if (proc == rank) {
2701         old_row = row;
2702 #if defined(PETSC_USE_CTABLE)
2703         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2704         row--;
2705 #else
2706         row = rmap_i[row];
2707 #endif
2708         ilen_row = imat_ilen[row];
2709         PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2710         mat_i    = imat_i[row];
2711         mat_a    = imat_a + mat_i;
2712         mat_j    = imat_j + mat_i;
2713         if (!allcolumns[i]) {
2714           for (k=0; k<ncols; k++) {
2715 #if defined(PETSC_USE_CTABLE)
2716             PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2717 #else
2718             tcol = cmap_i[cols[k]];
2719 #endif
2720             if (tcol) {
2721               *mat_j++ = tcol - 1;
2722               *mat_a++ = vals[k];
2723               ilen_row++;
2724             }
2725           }
2726         } else { /* allcolumns */
2727           for (k=0; k<ncols; k++) {
2728             *mat_j++ = cols[k];  /* global col index! */
2729             *mat_a++ = vals[k];
2730             ilen_row++;
2731           }
2732         }
2733         PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2734 
2735         imat_ilen[row] = ilen_row;
2736       }
2737     }
2738   }
2739 
2740   /* Now assemble the off proc rows */
2741   PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
2742   for (tmp2=0; tmp2<nrqs; tmp2++) {
2743     sbuf1_i = sbuf1[pa[tmp2]];
2744     jmax    = sbuf1_i[0];
2745     ct1     = 2*jmax + 1;
2746     ct2     = 0;
2747     rbuf2_i = rbuf2[tmp2];
2748     rbuf3_i = rbuf3[tmp2];
2749     rbuf4_i = rbuf4[tmp2];
2750     for (j=1; j<=jmax; j++) {
2751       is_no     = sbuf1_i[2*j-1];
2752       rmap_i    = rmap[is_no];
2753       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2754       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2755       imat_ilen = subc->ilen;
2756       imat_j    = subc->j;
2757       imat_i    = subc->i;
2758       imat_a    = subc->a;
2759       max1      = sbuf1_i[2*j];
2760       for (k=0; k<max1; k++,ct1++) {
2761         row = sbuf1_i[ct1];
2762 #if defined(PETSC_USE_CTABLE)
2763         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2764         row--;
2765 #else
2766         row = rmap_i[row];
2767 #endif
2768         ilen  = imat_ilen[row];
2769         mat_i = imat_i[row];
2770         mat_a = imat_a + mat_i;
2771         mat_j = imat_j + mat_i;
2772         max2  = rbuf2_i[ct1];
2773         if (!allcolumns[is_no]) {
2774           for (l=0; l<max2; l++,ct2++) {
2775 #if defined(PETSC_USE_CTABLE)
2776             PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2777 #else
2778             tcol = cmap_i[rbuf3_i[ct2]];
2779 #endif
2780             if (tcol) {
2781               *mat_j++ = tcol - 1;
2782               *mat_a++ = rbuf4_i[ct2];
2783               ilen++;
2784             }
2785           }
2786         } else { /* allcolumns */
2787           for (l=0; l<max2; l++,ct2++) {
2788             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2789             *mat_a++ = rbuf4_i[ct2];
2790             ilen++;
2791           }
2792         }
2793         imat_ilen[row] = ilen;
2794       }
2795     }
2796   }
2797 
2798   if (!iscsorted) { /* sort column indices of the rows */
2799     for (i=0; i<ismax; i++) {
2800       subc      = (Mat_SeqAIJ*)submats[i]->data;
2801       imat_j    = subc->j;
2802       imat_i    = subc->i;
2803       imat_a    = subc->a;
2804       imat_ilen = subc->ilen;
2805 
2806       if (allcolumns[i]) continue;
2807       jmax = nrow[i];
2808       for (j=0; j<jmax; j++) {
2809         mat_i = imat_i[j];
2810         mat_a = imat_a + mat_i;
2811         mat_j = imat_j + mat_i;
2812         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
2813       }
2814     }
2815   }
2816 
2817   PetscCall(PetscFree(r_waits4));
2818   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
2819   PetscCall(PetscFree(s_waits4));
2820 
2821   /* Restore the indices */
2822   for (i=0; i<ismax; i++) {
2823     PetscCall(ISRestoreIndices(isrow[i],irow+i));
2824     if (!allcolumns[i]) {
2825       PetscCall(ISRestoreIndices(iscol[i],icol+i));
2826     }
2827   }
2828 
2829   for (i=0; i<ismax; i++) {
2830     PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
2831     PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
2832   }
2833 
2834   /* Destroy allocated memory */
2835   if (sbuf_aa) {
2836     PetscCall(PetscFree(sbuf_aa[0]));
2837     PetscCall(PetscFree(sbuf_aa));
2838   }
2839   PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));
2840 
2841   for (i=0; i<nrqs; ++i) {
2842     PetscCall(PetscFree(rbuf4[i]));
2843   }
2844   PetscCall(PetscFree(rbuf4));
2845 
2846   PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns));
2847   PetscFunctionReturn(0);
2848 }
2849 
2850 /*
2851  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2852  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2854  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2855  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2856  state, and needs to be "assembled" later by compressing B's column space.
2857 
2858  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2859  Following this call, C->A & C->B have been created, even if empty.
2860  */
2861 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2862 {
2863   /* If making this function public, change the error returned in this function away from _PLIB. */
2864   Mat_MPIAIJ     *aij;
2865   Mat_SeqAIJ     *Baij;
2866   PetscBool      seqaij,Bdisassembled;
2867   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2868   PetscScalar    v;
2869   const PetscInt *rowindices,*colindices;
2870 
2871   PetscFunctionBegin;
2872   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2873   if (A) {
2874     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij));
2875     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2876     if (rowemb) {
2877       PetscCall(ISGetLocalSize(rowemb,&m));
2878       PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2879     } else {
2880       if (C->rmap->n != A->rmap->n) {
2881         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2882       }
2883     }
2884     if (dcolemb) {
2885       PetscCall(ISGetLocalSize(dcolemb,&n));
2886       PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n);
2887     } else {
2888       PetscCheck(C->cmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2889     }
2890   }
2891   if (B) {
2892     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij));
2893     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2894     if (rowemb) {
2895       PetscCall(ISGetLocalSize(rowemb,&m));
2896       PetscCheck(m == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2897     } else {
2898       if (C->rmap->n != B->rmap->n) {
2899         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2900       }
2901     }
2902     if (ocolemb) {
2903       PetscCall(ISGetLocalSize(ocolemb,&n));
2904       PetscCheck(n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n);
2905     } else {
2906       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2907     }
2908   }
2909 
2910   aij = (Mat_MPIAIJ*)C->data;
2911   if (!aij->A) {
2912     /* Mimic parts of MatMPIAIJSetPreallocation() */
2913     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A));
2914     PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n));
2915     PetscCall(MatSetBlockSizesFromMats(aij->A,C,C));
2916     PetscCall(MatSetType(aij->A,MATSEQAIJ));
2917     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A));
2918   }
2919   if (A) {
2920     PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A));
2921   } else {
2922     PetscCall(MatSetUp(aij->A));
2923   }
2924   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2925     /*
2926       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2927       need to "disassemble" B -- convert it to using C's global indices.
2928       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2929 
2930       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2931       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2932 
2933       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2934       At least avoid calling MatSetValues() and the implied searches?
2935     */
2936 
2937     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2938 #if defined(PETSC_USE_CTABLE)
2939       PetscCall(PetscTableDestroy(&aij->colmap));
2940 #else
2941       PetscCall(PetscFree(aij->colmap));
2942       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2943       if (aij->B) {
2944         PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt)));
2945       }
2946 #endif
2947       ngcol = 0;
2948       if (aij->lvec) {
2949         PetscCall(VecGetSize(aij->lvec,&ngcol));
2950       }
2951       if (aij->garray) {
2952         PetscCall(PetscFree(aij->garray));
2953         PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt)));
2954       }
2955       PetscCall(VecDestroy(&aij->lvec));
2956       PetscCall(VecScatterDestroy(&aij->Mvctx));
2957     }
2958     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2959       PetscCall(MatDestroy(&aij->B));
2960     }
2961     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2962       PetscCall(MatZeroEntries(aij->B));
2963     }
2964   }
2965   Bdisassembled = PETSC_FALSE;
2966   if (!aij->B) {
2967     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B));
2968     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B));
2969     PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N));
2970     PetscCall(MatSetBlockSizesFromMats(aij->B,B,B));
2971     PetscCall(MatSetType(aij->B,MATSEQAIJ));
2972     Bdisassembled = PETSC_TRUE;
2973   }
2974   if (B) {
2975     Baij = (Mat_SeqAIJ*)B->data;
2976     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2977       PetscCall(PetscMalloc1(B->rmap->n,&nz));
2978       for (i=0; i<B->rmap->n; i++) {
2979         nz[i] = Baij->i[i+1] - Baij->i[i];
2980       }
2981       PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz));
2982       PetscCall(PetscFree(nz));
2983     }
2984 
2985     PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend));
2986     shift = rend-rstart;
2987     count = 0;
2988     rowindices = NULL;
2989     colindices = NULL;
2990     if (rowemb) {
2991       PetscCall(ISGetIndices(rowemb,&rowindices));
2992     }
2993     if (ocolemb) {
2994       PetscCall(ISGetIndices(ocolemb,&colindices));
2995     }
2996     for (i=0; i<B->rmap->n; i++) {
2997       PetscInt row;
2998       row = i;
2999       if (rowindices) row = rowindices[i];
3000       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3001         col  = Baij->j[count];
3002         if (colindices) col = colindices[col];
3003         if (Bdisassembled && col>=rstart) col += shift;
3004         v    = Baij->a[count];
3005         PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES));
3006         ++count;
3007       }
3008     }
3009     /* No assembly for aij->B is necessary. */
3010     /* FIXME: set aij->B's nonzerostate correctly. */
3011   } else {
3012     PetscCall(MatSetUp(aij->B));
3013   }
3014   C->preallocated  = PETSC_TRUE;
3015   C->was_assembled = PETSC_FALSE;
3016   C->assembled     = PETSC_FALSE;
3017    /*
3018       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3019       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3020    */
3021   PetscFunctionReturn(0);
3022 }
3023 
3024 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3026  */
3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3028 {
3029   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3030 
3031   PetscFunctionBegin;
3032   PetscValidPointer(A,2);
3033   PetscValidPointer(B,3);
3034   /* FIXME: make sure C is assembled */
3035   *A = aij->A;
3036   *B = aij->B;
3037   /* Note that we don't incref *A and *B, so be careful! */
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 /*
3042   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3043   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3044 */
3045 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3046                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3047                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3048                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3049                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3050 {
3051   PetscMPIInt    size,flag;
3052   PetscInt       i,ii,cismax,ispar;
3053   Mat            *A,*B;
3054   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3055 
3056   PetscFunctionBegin;
3057   if (!ismax) PetscFunctionReturn(0);
3058 
3059   for (i = 0, cismax = 0; i < ismax; ++i) {
3060     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
3061     PetscCheck(flag == MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3062     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3063     if (size > 1) ++cismax;
3064   }
3065 
3066   /*
3067      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3068      ispar counts the number of parallel ISs across C's comm.
3069   */
3070   PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
3071   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3072     PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
3073     PetscFunctionReturn(0);
3074   }
3075 
3076   /* if (ispar) */
3077   /*
3078     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3079     These are used to extract the off-diag portion of the resulting parallel matrix.
3080     The row IS for the off-diag portion is the same as for the diag portion,
3081     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3082   */
3083   PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
3084   PetscCall(PetscMalloc1(cismax,&ciscol_p));
3085   for (i = 0, ii = 0; i < ismax; ++i) {
3086     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3087     if (size > 1) {
3088       /*
3089          TODO: This is the part that's ***NOT SCALABLE***.
3090          To fix this we need to extract just the indices of C's nonzero columns
3091          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3092          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3093          to be done without serializing on the IS list, so, most likely, it is best
3094          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3095       */
3096       PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
3097       /* Now we have to
3098          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3099              were sorted on each rank, concatenated they might no longer be sorted;
3100          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3101              indices in the nondecreasing order to the original index positions.
3102          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3103       */
3104       PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
3105       PetscCall(ISSort(ciscol[ii]));
3106       ++ii;
3107     }
3108   }
3109   PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
3110   for (i = 0, ii = 0; i < ismax; ++i) {
3111     PetscInt       j,issize;
3112     const PetscInt *indices;
3113 
3114     /*
3115        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3116      */
3117     PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
3118     PetscCall(ISSort(isrow[i]));
3119     PetscCall(ISGetLocalSize(isrow[i],&issize));
3120     PetscCall(ISGetIndices(isrow[i],&indices));
3121     for (j = 1; j < issize; ++j) {
3122       PetscCheck(indices[j] != indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3123     }
3124     PetscCall(ISRestoreIndices(isrow[i],&indices));
3125     PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
3126     PetscCall(ISSort(iscol[i]));
3127     PetscCall(ISGetLocalSize(iscol[i],&issize));
3128     PetscCall(ISGetIndices(iscol[i],&indices));
3129     for (j = 1; j < issize; ++j) {
3130       PetscCheck(indices[j-1] != indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3131     }
3132     PetscCall(ISRestoreIndices(iscol[i],&indices));
3133     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3134     if (size > 1) {
3135       cisrow[ii] = isrow[i];
3136       ++ii;
3137     }
3138   }
3139   /*
3140     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3141     array of sequential matrices underlying the resulting parallel matrices.
3142     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3143     contain duplicates.
3144 
3145     There are as many diag matrices as there are original index sets. There are only as many parallel
3146     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3147 
3148     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3149     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3150       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3151       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3152       with A[i] and B[ii] extracted from the corresponding MPI submat.
3153     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3154       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3155       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3156       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3157       values into A[i] and B[ii] sitting inside the corresponding submat.
3158     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3159       A[i], B[ii], AA[i] or BB[ii] matrices.
3160   */
3161   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3162   if (scall == MAT_INITIAL_MATRIX) {
3163     PetscCall(PetscMalloc1(ismax,submat));
3164   }
3165 
3166   /* Now obtain the sequential A and B submatrices separately. */
3167   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3168   PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
3169   PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));
3170 
3171   /*
3172     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3173     matrices A & B have been extracted directly into the parallel matrices containing them, or
3174     simply into the sequential matrix identical with the corresponding A (if size == 1).
3175     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3176     to have the same sparsity pattern.
3177     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3178     must be constructed for C. This is done by setseqmat(s).
3179   */
3180   for (i = 0, ii = 0; i < ismax; ++i) {
3181     /*
3182        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3183        That way we can avoid sorting and computing permutations when reusing.
3184        To this end:
3185         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3186         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3187           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3188     */
3189     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3190 
3191     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3192     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3193     if (size > 1) {
3194       if (scall == MAT_INITIAL_MATRIX) {
3195         PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
3196         PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
3197         PetscCall(MatSetType((*submat)[i],MATMPIAIJ));
3198         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3199         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3200       }
3201       /*
3202         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3203       */
3204       {
3205         Mat AA = A[i],BB = B[ii];
3206 
3207         if (AA || BB) {
3208           PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
3209           PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
3210           PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
3211         }
3212         PetscCall(MatDestroy(&AA));
3213       }
3214       PetscCall(ISDestroy(ciscol+ii));
3215       PetscCall(ISDestroy(ciscol_p+ii));
3216       ++ii;
3217     } else { /* if (size == 1) */
3218       if (scall == MAT_REUSE_MATRIX) {
3219         PetscCall(MatDestroy(&(*submat)[i]));
3220       }
3221       if (isrow_p[i] || iscol_p[i]) {
3222         PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
3223         PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
3224         /* Otherwise A is extracted straight into (*submat)[i]. */
3225         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3226         PetscCall(MatDestroy(A+i));
3227       } else (*submat)[i] = A[i];
3228     }
3229     PetscCall(ISDestroy(&isrow_p[i]));
3230     PetscCall(ISDestroy(&iscol_p[i]));
3231   }
3232   PetscCall(PetscFree2(cisrow,ciscol));
3233   PetscCall(PetscFree2(isrow_p,iscol_p));
3234   PetscCall(PetscFree(ciscol_p));
3235   PetscCall(PetscFree(A));
3236   PetscCall(MatDestroySubMatrices(cismax,&B));
3237   PetscFunctionReturn(0);
3238 }
3239 
3240 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3241 {
3242   PetscFunctionBegin;
3243   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
3244   PetscFunctionReturn(0);
3245 }
3246