xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 11486bccf1aeea1ca5536228f99d437b39bdaca6)
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscInt       i;
24 
25   PetscFunctionBegin;
26   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27   for (i=0; i<ov; ++i) {
28     PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
29   }
30   PetscFunctionReturn(0);
31 }
32 
33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
34 {
35   PetscInt       i;
36 
37   PetscFunctionBegin;
38   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
39   for (i=0; i<ov; ++i) {
40     PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
41   }
42   PetscFunctionReturn(0);
43 }
44 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - one round of overlap growth that avoids
  global-size work arrays.  Rows of the index sets owned by other ranks are routed
  to their owners via PetscSF, expanded there (Send_Scalable), and the expansions
  are shipped back and merged into the local sets (Receive_Scalable); locally owned
  rows are expanded in place (Local_Scalable).

  Collective: performs MPIU_Allreduce and PetscCommBuildTwoSided on the matrix's
  communicator, so all ranks must call it together.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
    PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row sends the pair <is id, row> */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
  PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  PetscCall(PetscFree2(todata,fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: now the former receivers (fromranks) are the senders */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  PetscCall(PetscCalloc1(nrecvrows,&todata));
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: expand the locally owned rows
     while the remote expansions are in flight */
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata,sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
187 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap rows received from
  remote ranks into the local index sets, then recreate each IS sorted and
  duplicate-free.

  Input:
    mat      - the matrix (used only for context; not dereferenced here)
    nidx     - number of index sets
    is       - the index sets; each is destroyed and recreated with merged indices
    nrecvs   - total number of PetscInt entries in recvdata
    recvdata - received stream of records <is id, data size, row, row, ...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slot per IS, each big enough for its current indices plus, worst case,
     every received entry landing in that set */
  PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    /* keep each IS's communicator alive across the destroy/recreate below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    PetscCall(ISGetIndices(is[i],&indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i],&indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i));
    PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}
235 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - expand rows requested by remote ranks.

  For each requesting rank, the incoming stream fromrows contains pairs
  <is id, row>; every requested (locally owned) row is expanded to its column
  indices in both the diagonal (amat) and off-diagonal (bmat) blocks, sorted,
  de-duplicated, and packed into a reply buffer as triples
  <is id, number of rows, rows...>.

  Output (caller frees):
    sbrowsizes - per-destination pairs <reply length, reply offset>
    sbrows     - the concatenated reply data
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count the expansion sizes so the reply buffer can be
     allocated once */
  for (i=0; i<nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: repeat the same traversal, this time filling the reply buffer */
  for (i=0;i<nfrom;i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i]  += indvc_ij;
      PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* record, per destination, where its reply starts in sbdata */
  PetscCall(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
357 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each index set by the column
  indices of its locally owned rows (diagonal block amat plus off-diagonal block
  bmat), then recreate the IS sorted and duplicate-free.  Rows owned by other
  ranks are skipped; they are handled by the send/receive routines.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i],&lsize));
    PetscCall(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue; /* remote rows are expanded on their owner */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: shift local column back to global numbering */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: gcols maps compressed columns to global */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   PetscCall(ISRestoreIndices(is[i],&indices));
   /* keep the IS's communicator across destroy/recreate */
   PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
   PetscCall(ISDestroy(&is[i]));
   PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
   PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
   PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
424 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5] ... mesg [n]   data(is[1])
  -----------
  mesg [n+1] ... mesg [m] data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be or which have been processed)
*/
/*
  MatIncreaseOverlap_MPIAIJ_Once - grow every index set in is[] by one layer of
  overlap using the non-scalable algorithm: each rank keeps a global-length
  bitmap (PetscBT) per IS, ships non-owned rows to their owners with
  MPI_Isend/Irecv (message layout documented above), expands locally, and
  merges the returned expansions before recreating the index sets.

  With PETSC_USE_CTABLE the per-IS result indices live in hash tables
  (table_data) instead of dense global-length arrays (data/d_p).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));

  PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    PetscCall(ISGetIndices(is[i],&idx[i]));
    PetscCall(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p]: total rows destined for rank p; w3[p]: number of ISs contributing to p */
  PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  for (i=0; i<imax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(outdat,size));
  PetscCall(PetscArrayzero(ptr,size));

  /* carve the single buffer tmp into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      PetscCall(PetscTableCreate(n[i],M,&table_data[i]));
    }
    PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M,imax, &Mimax));
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      PetscCall(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    PetscCall(ISRestoreIndices(is[i],idx+i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  PetscCall(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    PetscCall(ISDestroy(&is[i]));
  }

  /* Do Local work while the phase-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr,&recv_status));
  PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
  PetscCall(PetscFree4(w1,w2,w3,w4));

  PetscCall(PetscMalloc1(nrqr,&xdata));
  PetscCall(PetscMalloc1(nrqr,&isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      /* process replies in whatever order they arrive */
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max+1,&tdata));
#endif

  /* recreate the index sets from the accumulated results */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    /* table stores <row+1, position+1>; undo the +1 shifts while unpacking */
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k;
    }
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    PetscCall(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table,data,isz,t_p));
#else
  PetscCall(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}
792 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;  /* A: diagonal block; B: off-diagonal block of the parallel matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;  /* bit array giving O(1) membership tests for index set i */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;  /* hash table holding the indices of set i (sparse storage) */
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata; /* tdata[]: dense snapshot of table_data_i, ordered by insertion position */
#else
  PetscInt           *data_i;       /* dense array holding the indices of set i */
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;  /* first global row owned by this process */
  cstart = C->cmap->rstart;  /* first global column of the diagonal block */
  ai     = a->i;             /* CSR row pointers and column indices of A */
  aj     = a->j;
  bi     = b->i;             /* CSR row pointers and column indices of B (compressed column numbering) */
  bj     = b->j;
  garray = c->garray;        /* maps compressed B column index -> global column index */

  /* For each index set: walk every locally owned row currently in the set and
     add that row's column indices (from both A and B) to the set if new. */
  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    /* the table stores (global index + 1) -> (insertion position + 1), hence the
       double decrement below; tdata[] ends up ordered by insertion position */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    PetscCheck(tcount == isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    PetscCall(PetscMalloc1(tcount,&tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];  /* running size of set i; grows as new columns are appended */
    max     = isz[i];  /* only the rows present BEFORE this sweep are expanded */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      /* columns of the diagonal block: shift local column index to global */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {  /* test-and-set: true only the first time val is seen */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));  /* +1: PetscTable cannot store 0 */
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      /* columns of the off-diagonal block: garray maps to global numbering */
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;  /* publish the enlarged set size */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
894 
895 /*
896       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
897          and return the output
898 
899          Input:
900            C    - the matrix
901            nrqr - no of messages being processed.
902            rbuf - an array of pointers to the received requests
903 
904          Output:
905            xdata - array of messages to be sent back
906            isz1  - size of each message
907 
908   For better efficiency perhaps we should malloc separately each xdata[i],
909 then if a remalloc is required we need only copy the data for that one row
910 rather then all previous rows as it is now where a single large chunk of
911 memory is used.
912 
913 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;  /* diagonal and off-diagonal blocks */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;  /* scratch bit array, reset per index set, to deduplicate indices */

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;  /* compressed B column index -> global column index */

  /* Message layout (both rbuf[i] and xdata[i]):
       [0]            = number of index sets in the message
       [2*j-1],[2*j]  = (set id, count) header for set j
       then the row/column indices for each set in order.
     First pass: ct = total requested rows, total_sz = total indices received. */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Heuristic size for the single shared reply buffer xdata[0]:
     ct rows times the average nonzeros per local row, vs. the raw input size;
     grown by 1.5x below whenever the estimate proves too small. */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate,&xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m,&xtable));
  PetscCall(PetscArrayzero(isz1,nrqr));

  ct3 = 0;  /* total entries written into the shared chunk so far (across all messages) */
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor: first index past the header in rbuf_i */
    ct2    =  ct1;         /* write cursor into xdata[i] (header area is filled at the end) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m,xtable));
      oct2 = ct2;  /* start of this set's output; used below to revisit the rows just copied */
      kmax = rbuf_i[2*j];
      /* First copy the requested rows themselves (deduplicated) into the reply */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* shared chunk full: grow by 1.5x, copy, and re-point every finalized
               xdata[ctr] into the new chunk (they all alias xdata[0]) */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            PetscCall(PetscMalloc1(new_estimate,&tmp));
            PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Then, for each row just copied, append the new column indices of that
         row (overlap expansion), again deduplicated through xtable */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {  /* diagonal block: local -> global column */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {  /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    /* next message's reply starts right after this one inside the shared chunk */
    if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
  PetscFunctionReturn(0);
}
1030 /* -------------------------------------------------------------------------*/
1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1032 /*
1033     Every processor gets the entire matrix
1034 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N,&lens));
    /* local row length = nnz in the diagonal block + nnz in the off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place: each rank contributes its own slice of lens[], receives the rest */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF,&B));
    PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B,A,A));
    PetscCall(MatSetType(B,((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B,0,lens));
    /* NOTE(review): an array of 2 Mat pointers is returned, the second left NULL —
       presumably so callers can pass it to matrix-array destroy routines; confirm */
    PetscCall(PetscCalloc1(2,Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j array at this rank's slot */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* Interleave per row: off-diagonal columns left of the diagonal block, then
       diagonal-block columns, then the remaining off-diagonal columns — merging
       the two CSR blocks into a single global-numbered row */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* counts/displacements are now in units of nonzeros, summed from lens[] */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N));
    /* set the b->i indices: prefix sum of the row lengths */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, reuse the existing B */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B,&bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write values directly into B's a array */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index pass above,
       so values line up with the previously gathered j entries */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* on the MAT_REUSE_MATRIX path recvcounts/displs were never allocated */
    if (!recvcounts) {
      PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda));
  }  /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts,displs));

  PetscCall(MatPropagateSymmetryOptions(A,B));
  PetscFunctionReturn(0);
}
1200 
1201 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1202 {
1203   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1204   Mat            submat,A = c->A,B = c->B;
1205   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1206   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1207   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1208   const PetscInt *icol,*irow;
1209   PetscInt       nrow,ncol,start;
1210   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1211   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1212   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1213   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1214   PetscInt       *lens,rmax,ncols,*cols,Crow;
1215 #if defined(PETSC_USE_CTABLE)
1216   PetscTable     cmap,rmap;
1217   PetscInt       *cmap_loc,*rmap_loc;
1218 #else
1219   PetscInt       *cmap,*rmap;
1220 #endif
1221   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1222   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1223   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1224   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1225   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1226   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1227   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1228   MPI_Comm       comm;
1229   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1230   PetscMPIInt    *onodes1,*olengths1,idex,end;
1231   Mat_SubSppt    *smatis1;
1232   PetscBool      isrowsorted,iscolsorted;
1233 
1234   PetscFunctionBegin;
1235   PetscValidLogicalCollectiveInt(C,ismax,2);
1236   PetscValidLogicalCollectiveEnum(C,scall,5);
1237   PetscCheck(ismax == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1238   PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
1239   PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a));
1240   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
1241   size = c->size;
1242   rank = c->rank;
1243 
1244   PetscCall(ISSorted(iscol[0],&iscolsorted));
1245   PetscCall(ISSorted(isrow[0],&isrowsorted));
1246   PetscCall(ISGetIndices(isrow[0],&irow));
1247   PetscCall(ISGetLocalSize(isrow[0],&nrow));
1248   if (allcolumns) {
1249     icol = NULL;
1250     ncol = C->cmap->N;
1251   } else {
1252     PetscCall(ISGetIndices(iscol[0],&icol));
1253     PetscCall(ISGetLocalSize(iscol[0],&ncol));
1254   }
1255 
1256   if (scall == MAT_INITIAL_MATRIX) {
1257     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1258 
1259     /* Get some new tags to keep the communication clean */
1260     tag1 = ((PetscObject)C)->tag;
1261     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
1262     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
1263 
1264     /* evaluate communication - mesg to who, length of mesg, and buffer space
1265      required. Based on this, buffers are allocated, and data copied into them */
1266     PetscCall(PetscCalloc2(size,&w1,size,&w2));
1267     PetscCall(PetscMalloc1(nrow,&row2proc));
1268 
1269     /* w1[proc] = num of rows owned by proc -- to be requested */
1270     proc = 0;
1271     nrqs = 0; /* num of outgoing messages */
1272     for (j=0; j<nrow; j++) {
1273       row  = irow[j];
1274       if (!isrowsorted) proc = 0;
1275       while (row >= C->rmap->range[proc+1]) proc++;
1276       w1[proc]++;
1277       row2proc[j] = proc; /* map row index to proc */
1278 
1279       if (proc != rank && !w2[proc]) {
1280         w2[proc] = 1; nrqs++;
1281       }
1282     }
1283     w1[rank] = 0;  /* rows owned by self will not be requested */
1284 
1285     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
1286     for (proc=0,j=0; proc<size; proc++) {
1287       if (w1[proc]) { pa[j++] = proc;}
1288     }
1289 
1290     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1291     msz = 0;              /* total mesg length (for all procs) */
1292     for (i=0; i<nrqs; i++) {
1293       proc      = pa[i];
1294       w1[proc] += 3;
1295       msz      += w1[proc];
1296     }
1297     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
1298 
1299     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1300     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1301     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
1302 
1303     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1304        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1305     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
1306 
1307     /* Now post the Irecvs corresponding to these messages */
1308     PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
1309 
1310     PetscCall(PetscFree(onodes1));
1311     PetscCall(PetscFree(olengths1));
1312 
1313     /* Allocate Memory for outgoing messages */
1314     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
1315     PetscCall(PetscArrayzero(sbuf1,size));
1316     PetscCall(PetscArrayzero(ptr,size));
1317 
1318     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1319     iptr = tmp;
1320     for (i=0; i<nrqs; i++) {
1321       proc        = pa[i];
1322       sbuf1[proc] = iptr;
1323       iptr       += w1[proc];
1324     }
1325 
1326     /* Form the outgoing messages */
1327     /* Initialize the header space */
1328     for (i=0; i<nrqs; i++) {
1329       proc      = pa[i];
1330       PetscCall(PetscArrayzero(sbuf1[proc],3));
1331       ptr[proc] = sbuf1[proc] + 3;
1332     }
1333 
1334     /* Parse the isrow and copy data into outbuf */
1335     PetscCall(PetscArrayzero(ctr,size));
1336     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1337       proc = row2proc[j];
1338       if (proc != rank) { /* copy to the outgoing buf*/
1339         *ptr[proc] = irow[j];
1340         ctr[proc]++; ptr[proc]++;
1341       }
1342     }
1343 
1344     /* Update the headers for the current IS */
1345     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1346       if ((ctr_j = ctr[j])) {
1347         sbuf1_j        = sbuf1[j];
1348         k              = ++sbuf1_j[0];
1349         sbuf1_j[2*k]   = ctr_j;
1350         sbuf1_j[2*k-1] = 0;
1351       }
1352     }
1353 
1354     /* Now post the sends */
1355     PetscCall(PetscMalloc1(nrqs,&s_waits1));
1356     for (i=0; i<nrqs; ++i) {
1357       proc = pa[i];
1358       PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i));
1359     }
1360 
1361     /* Post Receives to capture the buffer size */
1362     PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2));
1363     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
1364 
1365     if (nrqs) rbuf2[0] = tmp + msz;
1366     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1367 
1368     for (i=0; i<nrqs; ++i) {
1369       proc = pa[i];
1370       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i));
1371     }
1372 
1373     PetscCall(PetscFree2(w1,w2));
1374 
1375     /* Send to other procs the buf size they should allocate */
1376     /* Receive messages*/
1377     PetscCall(PetscMalloc1(nrqr,&r_status1));
1378     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
1379 
1380     PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1));
1381     for (i=0; i<nrqr; ++i) {
1382       req_size[i] = 0;
1383       rbuf1_i        = rbuf1[i];
1384       start          = 2*rbuf1_i[0] + 1;
1385       PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
1386       PetscCall(PetscMalloc1(end,&sbuf2[i]));
1387       sbuf2_i        = sbuf2[i];
1388       for (j=start; j<end; j++) {
1389         k            = rbuf1_i[j] - rstart;
1390         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1391         sbuf2_i[j]   = ncols;
1392         req_size[i] += ncols;
1393       }
1394       req_source1[i] = r_status1[i].MPI_SOURCE;
1395 
1396       /* form the header */
1397       sbuf2_i[0] = req_size[i];
1398       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1399 
1400       PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
1401     }
1402 
1403     PetscCall(PetscFree(r_status1));
1404     PetscCall(PetscFree(r_waits1));
1405 
1406     /* rbuf2 is received, Post recv column indices a->j */
1407     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2));
1408 
1409     PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3));
1410     for (i=0; i<nrqs; ++i) {
1411       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
1412       req_source2[i] = r_status2[i].MPI_SOURCE;
1413       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
1414     }
1415 
1416     /* Wait on sends1 and sends2 */
1417     PetscCall(PetscMalloc1(nrqs,&s_status1));
1418     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
1419     PetscCall(PetscFree(s_waits1));
1420     PetscCall(PetscFree(s_status1));
1421 
1422     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
1423     PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2));
1424 
1425     /* Now allocate sending buffers for a->j, and send them off */
1426     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
1427     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1428     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
1429     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1430 
1431     for (i=0; i<nrqr; i++) { /* for each requested message */
1432       rbuf1_i   = rbuf1[i];
1433       sbuf_aj_i = sbuf_aj[i];
1434       ct1       = 2*rbuf1_i[0] + 1;
1435       ct2       = 0;
1436       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1437 
1438       kmax = rbuf1[i][2];
1439       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1440         row    = rbuf1_i[ct1] - rstart;
1441         nzA    = ai[row+1] - ai[row];
1442         nzB    = bi[row+1] - bi[row];
1443         ncols  = nzA + nzB;
1444         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1445 
1446         /* load the column indices for this row into cols*/
1447         cols = sbuf_aj_i + ct2;
1448 
1449         lwrite = 0;
1450         for (l=0; l<nzB; l++) {
1451           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1452         }
1453         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1454         for (l=0; l<nzB; l++) {
1455           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1456         }
1457 
1458         ct2 += ncols;
1459       }
1460       PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
1461     }
1462 
1463     /* create column map (cmap): global col of C -> local col of submat */
1464 #if defined(PETSC_USE_CTABLE)
1465     if (!allcolumns) {
1466       PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap));
1467       PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc));
1468       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1469         if (icol[j] >= cstart && icol[j] <cend) {
1470           cmap_loc[icol[j] - cstart] = j+1;
1471         } else { /* use PetscTable for non-local col indices */
1472           PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES));
1473         }
1474       }
1475     } else {
1476       cmap     = NULL;
1477       cmap_loc = NULL;
1478     }
1479     PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc));
1480 #else
1481     if (!allcolumns) {
1482       PetscCall(PetscCalloc1(C->cmap->N,&cmap));
1483       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1484     } else {
1485       cmap = NULL;
1486     }
1487 #endif
1488 
1489     /* Create lens for MatSeqAIJSetPreallocation() */
1490     PetscCall(PetscCalloc1(nrow,&lens));
1491 
1492     /* Compute lens from local part of C */
1493     for (j=0; j<nrow; j++) {
1494       row  = irow[j];
1495       proc = row2proc[j];
1496       if (proc == rank) {
1497         /* diagonal part A = c->A */
1498         ncols = ai[row-rstart+1] - ai[row-rstart];
1499         cols  = aj + ai[row-rstart];
1500         if (!allcolumns) {
1501           for (k=0; k<ncols; k++) {
1502 #if defined(PETSC_USE_CTABLE)
1503             tcol = cmap_loc[cols[k]];
1504 #else
1505             tcol = cmap[cols[k]+cstart];
1506 #endif
1507             if (tcol) lens[j]++;
1508           }
1509         } else { /* allcolumns */
1510           lens[j] = ncols;
1511         }
1512 
1513         /* off-diagonal part B = c->B */
1514         ncols = bi[row-rstart+1] - bi[row-rstart];
1515         cols  = bj + bi[row-rstart];
1516         if (!allcolumns) {
1517           for (k=0; k<ncols; k++) {
1518 #if defined(PETSC_USE_CTABLE)
1519             PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1520 #else
1521             tcol = cmap[bmap[cols[k]]];
1522 #endif
1523             if (tcol) lens[j]++;
1524           }
1525         } else { /* allcolumns */
1526           lens[j] += ncols;
1527         }
1528       }
1529     }
1530 
1531     /* Create row map (rmap): global row of C -> local row of submat */
1532 #if defined(PETSC_USE_CTABLE)
1533     PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap));
1534     for (j=0; j<nrow; j++) {
1535       row  = irow[j];
1536       proc = row2proc[j];
1537       if (proc == rank) { /* a local row */
1538         rmap_loc[row - rstart] = j;
1539       } else {
1540         PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES));
1541       }
1542     }
1543 #else
1544     PetscCall(PetscCalloc1(C->rmap->N,&rmap));
1545     for (j=0; j<nrow; j++) {
1546       rmap[irow[j]] = j;
1547     }
1548 #endif
1549 
1550     /* Update lens from offproc data */
1551     /* recv a->j is done */
1552     PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3));
1553     for (i=0; i<nrqs; i++) {
1554       proc    = pa[i];
1555       sbuf1_i = sbuf1[proc];
1556       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1557       ct1     = 2 + 1;
1558       ct2     = 0;
1559       rbuf2_i = rbuf2[i]; /* received length of C->j */
1560       rbuf3_i = rbuf3[i]; /* received C->j */
1561 
1562       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1563       max1   = sbuf1_i[2];
1564       for (k=0; k<max1; k++,ct1++) {
1565 #if defined(PETSC_USE_CTABLE)
1566         PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row));
1567         row--;
1568         PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1569 #else
1570         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1571 #endif
1572         /* Now, store row index of submat in sbuf1_i[ct1] */
1573         sbuf1_i[ct1] = row;
1574 
1575         nnz = rbuf2_i[ct1];
1576         if (!allcolumns) {
1577           for (l=0; l<nnz; l++,ct2++) {
1578 #if defined(PETSC_USE_CTABLE)
1579             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1580               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1581             } else {
1582               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1583             }
1584 #else
1585             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1586 #endif
1587             if (tcol) lens[row]++;
1588           }
1589         } else { /* allcolumns */
1590           lens[row] += nnz;
1591         }
1592       }
1593     }
1594     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3));
1595     PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3));
1596 
1597     /* Create the submatrices */
1598     PetscCall(MatCreate(PETSC_COMM_SELF,&submat));
1599     PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE));
1600 
1601     PetscCall(ISGetBlockSize(isrow[0],&i));
1602     PetscCall(ISGetBlockSize(iscol[0],&j));
1603     PetscCall(MatSetBlockSizes(submat,i,j));
1604     PetscCall(MatSetType(submat,((PetscObject)A)->type_name));
1605     PetscCall(MatSeqAIJSetPreallocation(submat,0,lens));
1606 
1607     /* create struct Mat_SubSppt and attached it to submat */
1608     PetscCall(PetscNew(&smatis1));
1609     subc = (Mat_SeqAIJ*)submat->data;
1610     subc->submatis1 = smatis1;
1611 
1612     smatis1->id          = 0;
1613     smatis1->nrqs        = nrqs;
1614     smatis1->nrqr        = nrqr;
1615     smatis1->rbuf1       = rbuf1;
1616     smatis1->rbuf2       = rbuf2;
1617     smatis1->rbuf3       = rbuf3;
1618     smatis1->sbuf2       = sbuf2;
1619     smatis1->req_source2 = req_source2;
1620 
1621     smatis1->sbuf1       = sbuf1;
1622     smatis1->ptr         = ptr;
1623     smatis1->tmp         = tmp;
1624     smatis1->ctr         = ctr;
1625 
1626     smatis1->pa           = pa;
1627     smatis1->req_size     = req_size;
1628     smatis1->req_source1  = req_source1;
1629 
1630     smatis1->allcolumns  = allcolumns;
1631     smatis1->singleis    = PETSC_TRUE;
1632     smatis1->row2proc    = row2proc;
1633     smatis1->rmap        = rmap;
1634     smatis1->cmap        = cmap;
1635 #if defined(PETSC_USE_CTABLE)
1636     smatis1->rmap_loc    = rmap_loc;
1637     smatis1->cmap_loc    = cmap_loc;
1638 #endif
1639 
1640     smatis1->destroy     = submat->ops->destroy;
1641     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1642     submat->factortype   = C->factortype;
1643 
1644     /* compute rmax */
1645     rmax = 0;
1646     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1647 
1648   } else { /* scall == MAT_REUSE_MATRIX */
1649     submat = submats[0];
1650     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1651 
1652     subc    = (Mat_SeqAIJ*)submat->data;
1653     rmax    = subc->rmax;
1654     smatis1 = subc->submatis1;
1655     nrqs        = smatis1->nrqs;
1656     nrqr        = smatis1->nrqr;
1657     rbuf1       = smatis1->rbuf1;
1658     rbuf2       = smatis1->rbuf2;
1659     rbuf3       = smatis1->rbuf3;
1660     req_source2 = smatis1->req_source2;
1661 
1662     sbuf1     = smatis1->sbuf1;
1663     sbuf2     = smatis1->sbuf2;
1664     ptr       = smatis1->ptr;
1665     tmp       = smatis1->tmp;
1666     ctr       = smatis1->ctr;
1667 
1668     pa         = smatis1->pa;
1669     req_size   = smatis1->req_size;
1670     req_source1 = smatis1->req_source1;
1671 
1672     allcolumns = smatis1->allcolumns;
1673     row2proc   = smatis1->row2proc;
1674     rmap       = smatis1->rmap;
1675     cmap       = smatis1->cmap;
1676 #if defined(PETSC_USE_CTABLE)
1677     rmap_loc   = smatis1->rmap_loc;
1678     cmap_loc   = smatis1->cmap_loc;
1679 #endif
1680   }
1681 
1682   /* Post recv matrix values */
1683   PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals));
1684   PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4));
1685   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
1686   for (i=0; i<nrqs; ++i) {
1687     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
1688     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
1689   }
1690 
1691   /* Allocate sending buffers for a->a, and send them off */
1692   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
1693   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1694   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
1695   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1696 
1697   for (i=0; i<nrqr; i++) {
1698     rbuf1_i   = rbuf1[i];
1699     sbuf_aa_i = sbuf_aa[i];
1700     ct1       = 2*rbuf1_i[0]+1;
1701     ct2       = 0;
1702     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1703 
1704     kmax = rbuf1_i[2];
1705     for (k=0; k<kmax; k++,ct1++) {
1706       row = rbuf1_i[ct1] - rstart;
1707       nzA = ai[row+1] - ai[row];
1708       nzB = bi[row+1] - bi[row];
1709       ncols  = nzA + nzB;
1710       cworkB = bj + bi[row];
1711       vworkA = a_a + ai[row];
1712       vworkB = b_a + bi[row];
1713 
1714       /* load the column values for this row into vals*/
1715       vals = sbuf_aa_i + ct2;
1716 
1717       lwrite = 0;
1718       for (l=0; l<nzB; l++) {
1719         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1720       }
1721       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1722       for (l=0; l<nzB; l++) {
1723         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1724       }
1725 
1726       ct2 += ncols;
1727     }
1728     PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
1729   }
1730 
1731   /* Assemble submat */
1732   /* First assemble the local rows */
1733   for (j=0; j<nrow; j++) {
1734     row  = irow[j];
1735     proc = row2proc[j];
1736     if (proc == rank) {
1737       Crow = row - rstart;  /* local row index of C */
1738 #if defined(PETSC_USE_CTABLE)
1739       row = rmap_loc[Crow]; /* row index of submat */
1740 #else
1741       row = rmap[row];
1742 #endif
1743 
1744       if (allcolumns) {
1745         /* diagonal part A = c->A */
1746         ncols = ai[Crow+1] - ai[Crow];
1747         cols  = aj + ai[Crow];
1748         vals  = a_a + ai[Crow];
1749         i     = 0;
1750         for (k=0; k<ncols; k++) {
1751           subcols[i]   = cols[k] + cstart;
1752           subvals[i++] = vals[k];
1753         }
1754 
1755         /* off-diagonal part B = c->B */
1756         ncols = bi[Crow+1] - bi[Crow];
1757         cols  = bj + bi[Crow];
1758         vals  = b_a + bi[Crow];
1759         for (k=0; k<ncols; k++) {
1760           subcols[i]   = bmap[cols[k]];
1761           subvals[i++] = vals[k];
1762         }
1763 
1764         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1765 
1766       } else { /* !allcolumns */
1767 #if defined(PETSC_USE_CTABLE)
1768         /* diagonal part A = c->A */
1769         ncols = ai[Crow+1] - ai[Crow];
1770         cols  = aj + ai[Crow];
1771         vals  = a_a + ai[Crow];
1772         i     = 0;
1773         for (k=0; k<ncols; k++) {
1774           tcol = cmap_loc[cols[k]];
1775           if (tcol) {
1776             subcols[i]   = --tcol;
1777             subvals[i++] = vals[k];
1778           }
1779         }
1780 
1781         /* off-diagonal part B = c->B */
1782         ncols = bi[Crow+1] - bi[Crow];
1783         cols  = bj + bi[Crow];
1784         vals  = b_a + bi[Crow];
1785         for (k=0; k<ncols; k++) {
1786           PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1787           if (tcol) {
1788             subcols[i]   = --tcol;
1789             subvals[i++] = vals[k];
1790           }
1791         }
1792 #else
1793         /* diagonal part A = c->A */
1794         ncols = ai[Crow+1] - ai[Crow];
1795         cols  = aj + ai[Crow];
1796         vals  = a_a + ai[Crow];
1797         i     = 0;
1798         for (k=0; k<ncols; k++) {
1799           tcol = cmap[cols[k]+cstart];
1800           if (tcol) {
1801             subcols[i]   = --tcol;
1802             subvals[i++] = vals[k];
1803           }
1804         }
1805 
1806         /* off-diagonal part B = c->B */
1807         ncols = bi[Crow+1] - bi[Crow];
1808         cols  = bj + bi[Crow];
1809         vals  = b_a + bi[Crow];
1810         for (k=0; k<ncols; k++) {
1811           tcol = cmap[bmap[cols[k]]];
1812           if (tcol) {
1813             subcols[i]   = --tcol;
1814             subvals[i++] = vals[k];
1815           }
1816         }
1817 #endif
1818         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1819       }
1820     }
1821   }
1822 
1823   /* Now assemble the off-proc rows */
1824   for (i=0; i<nrqs; i++) { /* for each requested message */
1825     /* recv values from other processes */
1826     PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i));
1827     proc    = pa[idex];
1828     sbuf1_i = sbuf1[proc];
1829     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1830     ct1     = 2 + 1;
1831     ct2     = 0; /* count of received C->j */
1832     ct3     = 0; /* count of received C->j that will be inserted into submat */
1833     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1834     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1835     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1836 
1837     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1838     max1 = sbuf1_i[2];             /* num of rows */
1839     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1840       row = sbuf1_i[ct1]; /* row index of submat */
1841       if (!allcolumns) {
1842         idex = 0;
1843         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1844           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1845           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1846 #if defined(PETSC_USE_CTABLE)
1847             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1848               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1849             } else {
1850               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1851             }
1852 #else
1853             tcol = cmap[rbuf3_i[ct2]];
1854 #endif
1855             if (tcol) {
1856               subcols[idex]   = --tcol; /* may not be sorted */
1857               subvals[idex++] = rbuf4_i[ct2];
1858 
1859               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1860                For reuse, we replace received C->j with index that should be inserted to submat */
1861               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1862             }
1863           }
1864           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES));
1865         } else { /* scall == MAT_REUSE_MATRIX */
1866           submat = submats[0];
1867           subc   = (Mat_SeqAIJ*)submat->data;
1868 
1869           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1870           for (l=0; l<nnz; l++) {
1871             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1872             subvals[idex++] = rbuf4_i[ct2];
1873           }
1874 
1875           bj = subc->j + subc->i[row]; /* sorted column indices */
1876           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES));
1877         }
1878       } else { /* allcolumns */
1879         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1880         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES));
1881         ct2 += nnz;
1882       }
1883     }
1884   }
1885 
1886   /* sending a->a are done */
1887   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4));
1888   PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4));
1889 
1890   PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY));
1891   PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY));
1892   submats[0] = submat;
1893 
1894   /* Restore the indices */
1895   PetscCall(ISRestoreIndices(isrow[0],&irow));
1896   if (!allcolumns) {
1897     PetscCall(ISRestoreIndices(iscol[0],&icol));
1898   }
1899 
1900   /* Destroy allocated memory */
1901   for (i=0; i<nrqs; ++i) {
1902     PetscCall(PetscFree(rbuf4[i]));
1903   }
1904   PetscCall(PetscFree3(rbuf4,subcols,subvals));
1905   if (sbuf_aa) {
1906     PetscCall(PetscFree(sbuf_aa[0]));
1907     PetscCall(PetscFree(sbuf_aa));
1908   }
1909 
1910   if (scall == MAT_INITIAL_MATRIX) {
1911     PetscCall(PetscFree(lens));
1912     if (sbuf_aj) {
1913       PetscCall(PetscFree(sbuf_aj[0]));
1914       PetscCall(PetscFree(sbuf_aj));
1915     }
1916   }
1917   PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
1918   PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
1919   PetscFunctionReturn(0);
1920 }
1921 
1922 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1923 {
1924   PetscInt       ncol;
1925   PetscBool      colflag,allcolumns=PETSC_FALSE;
1926 
1927   PetscFunctionBegin;
1928   /* Allocate memory to hold all the submatrices */
1929   if (scall == MAT_INITIAL_MATRIX) {
1930     PetscCall(PetscCalloc1(2,submat));
1931   }
1932 
1933   /* Check for special case: each processor gets entire matrix columns */
1934   PetscCall(ISIdentity(iscol[0],&colflag));
1935   PetscCall(ISGetLocalSize(iscol[0],&ncol));
1936   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1937 
1938   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
1939   PetscFunctionReturn(0);
1940 }
1941 
/*
   MatCreateSubMatrices_MPIAIJ - Extracts ismax sequential submatrices from the
   parallel AIJ matrix C, one per (isrow[i], iscol[i]) pair.

   Collective on C: the MAT_INITIAL_MATRIX path performs an MPIU_Allreduce, so
   every rank must call this routine with a consistent scall.

   Input:
     C     - the parallel MPIAIJ matrix
     ismax - number of index-set pairs on this process
     isrow - row index sets (one per requested submatrix)
     iscol - column index sets (one per requested submatrix)
     scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

   Output:
     submat - array of sequential submatrices; allocated here when
              scall == MAT_INITIAL_MATRIX (ismax+nstages entries, the extra
              entries being dummy matrices used to carry communication state)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices that fit in a ~20MB integer budget per stage;
     scaled down by the global column count since each submatrix may need a
     cmap array of that length */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* always allow at least one submatrix per stage */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* Fast path candidate: a single identity IS pair covering the whole square
       matrix means the caller effectively wants a copy of all of C */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow,&rowflag));
      PetscCall(ISIdentity(*iscol,&colflag));
      PetscCall(ISGetLocalSize(*isrow,&nrow));
      PetscCall(ISGetLocalSize(*iscol,&ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* allow the user to veto the fast path from the options database */
        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Reduce both decisions in one Allreduce: out[0] = MAX(-wantallmatrix) is
       -1 only if EVERY rank wants the whole matrix (logical AND via negation);
       out[1] = global maximum stage count so all ranks loop the same number
       of times (collective correctness of the stage loop below) */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the communication state saved during the INITIAL call; it lives
       either on the first real submatrix or on a dummy matrix when ismax==0 */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* reuse the stage count agreed on at INITIAL time */
    }
  }

  if (wantallmatrix) {
    /* Fast path: copy the entire matrix in one shot */
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(ismax+nstages,submat));
  }

  /* Stage loop: every rank executes exactly nstages iterations (global value),
     extracting up to nmax submatrices per stage; ranks that have exhausted
     their local requests still participate with max_no == 0 */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else                   max_no = ismax-pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages; /* stash stage count on the dummy for REUSE */
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2040 
2041 /* -------------------------------------------------------------------------*/
2042 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2043 {
2044   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2045   Mat            A  = c->A;
2046   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2047   const PetscInt **icol,**irow;
2048   PetscInt       *nrow,*ncol,start;
2049   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2050   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2051   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2052   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2053   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2054 #if defined(PETSC_USE_CTABLE)
2055   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2056 #else
2057   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2058 #endif
2059   const PetscInt *irow_i;
2060   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2061   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2062   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2063   MPI_Comm       comm;
2064   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2065   PetscMPIInt    *onodes1,*olengths1,end;
2066   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2067   Mat_SubSppt    *smat_i;
2068   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2069   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2070 
2071   PetscFunctionBegin;
2072   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
2073   size = c->size;
2074   rank = c->rank;
2075 
2076   PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
2077   PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));
2078 
2079   for (i=0; i<ismax; i++) {
2080     PetscCall(ISSorted(iscol[i],&issorted[i]));
2081     if (!issorted[i]) iscsorted = issorted[i];
2082 
2083     PetscCall(ISSorted(isrow[i],&issorted[i]));
2084 
2085     PetscCall(ISGetIndices(isrow[i],&irow[i]));
2086     PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));
2087 
2088     /* Check for special case: allcolumn */
2089     PetscCall(ISIdentity(iscol[i],&colflag));
2090     PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
2091     if (colflag && ncol[i] == C->cmap->N) {
2092       allcolumns[i] = PETSC_TRUE;
2093       icol[i] = NULL;
2094     } else {
2095       allcolumns[i] = PETSC_FALSE;
2096       PetscCall(ISGetIndices(iscol[i],&icol[i]));
2097     }
2098   }
2099 
2100   if (scall == MAT_REUSE_MATRIX) {
2101     /* Assumes new rows are same length as the old rows */
2102     for (i=0; i<ismax; i++) {
2103       PetscCheck(submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
2104       subc = (Mat_SeqAIJ*)submats[i]->data;
2105       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2106 
2107       /* Initial matrix as if empty */
2108       PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n));
2109 
2110       smat_i   = subc->submatis1;
2111 
2112       nrqs        = smat_i->nrqs;
2113       nrqr        = smat_i->nrqr;
2114       rbuf1       = smat_i->rbuf1;
2115       rbuf2       = smat_i->rbuf2;
2116       rbuf3       = smat_i->rbuf3;
2117       req_source2 = smat_i->req_source2;
2118 
2119       sbuf1     = smat_i->sbuf1;
2120       sbuf2     = smat_i->sbuf2;
2121       ptr       = smat_i->ptr;
2122       tmp       = smat_i->tmp;
2123       ctr       = smat_i->ctr;
2124 
2125       pa          = smat_i->pa;
2126       req_size    = smat_i->req_size;
2127       req_source1 = smat_i->req_source1;
2128 
2129       allcolumns[i] = smat_i->allcolumns;
2130       row2proc[i]   = smat_i->row2proc;
2131       rmap[i]       = smat_i->rmap;
2132       cmap[i]       = smat_i->cmap;
2133     }
2134 
2135     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2136       PetscCheck(submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2137       smat_i = (Mat_SubSppt*)submats[0]->data;
2138 
2139       nrqs        = smat_i->nrqs;
2140       nrqr        = smat_i->nrqr;
2141       rbuf1       = smat_i->rbuf1;
2142       rbuf2       = smat_i->rbuf2;
2143       rbuf3       = smat_i->rbuf3;
2144       req_source2 = smat_i->req_source2;
2145 
2146       sbuf1       = smat_i->sbuf1;
2147       sbuf2       = smat_i->sbuf2;
2148       ptr         = smat_i->ptr;
2149       tmp         = smat_i->tmp;
2150       ctr         = smat_i->ctr;
2151 
2152       pa          = smat_i->pa;
2153       req_size    = smat_i->req_size;
2154       req_source1 = smat_i->req_source1;
2155 
2156       allcolumns[0] = PETSC_FALSE;
2157     }
2158   } else { /* scall == MAT_INITIAL_MATRIX */
2159     /* Get some new tags to keep the communication clean */
2160     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
2161     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
2162 
2163     /* evaluate communication - mesg to who, length of mesg, and buffer space
2164      required. Based on this, buffers are allocated, and data copied into them*/
2165     PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */
2166 
2167     for (i=0; i<ismax; i++) {
2168       jmax   = nrow[i];
2169       irow_i = irow[i];
2170 
2171       PetscCall(PetscMalloc1(jmax,&row2proc_i));
2172       row2proc[i] = row2proc_i;
2173 
2174       if (issorted[i]) proc = 0;
2175       for (j=0; j<jmax; j++) {
2176         if (!issorted[i]) proc = 0;
2177         row = irow_i[j];
2178         while (row >= C->rmap->range[proc+1]) proc++;
2179         w4[proc]++;
2180         row2proc_i[j] = proc; /* map row index to proc */
2181       }
2182       for (j=0; j<size; j++) {
2183         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2184       }
2185     }
2186 
2187     nrqs     = 0;              /* no of outgoing messages */
2188     msz      = 0;              /* total mesg length (for all procs) */
2189     w1[rank] = 0;              /* no mesg sent to self */
2190     w3[rank] = 0;
2191     for (i=0; i<size; i++) {
2192       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2193     }
2194     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
2195     for (i=0,j=0; i<size; i++) {
2196       if (w1[i]) { pa[j] = i; j++; }
2197     }
2198 
2199     /* Each message would have a header = 1 + 2*(no of IS) + data */
2200     for (i=0; i<nrqs; i++) {
2201       j      = pa[i];
2202       w1[j] += w2[j] + 2* w3[j];
2203       msz   += w1[j];
2204     }
2205     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
2206 
2207     /* Determine the number of messages to expect, their lengths, from from-ids */
2208     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
2209     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
2210 
2211     /* Now post the Irecvs corresponding to these messages */
2212     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
2213     PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
2214 
2215     /* Allocate Memory for outgoing messages */
2216     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
2217     PetscCall(PetscArrayzero(sbuf1,size));
2218     PetscCall(PetscArrayzero(ptr,size));
2219 
2220     {
2221       PetscInt *iptr = tmp;
2222       k    = 0;
2223       for (i=0; i<nrqs; i++) {
2224         j        = pa[i];
2225         iptr    += k;
2226         sbuf1[j] = iptr;
2227         k        = w1[j];
2228       }
2229     }
2230 
2231     /* Form the outgoing messages. Initialize the header space */
2232     for (i=0; i<nrqs; i++) {
2233       j           = pa[i];
2234       sbuf1[j][0] = 0;
2235       PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
2236       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2237     }
2238 
2239     /* Parse the isrow and copy data into outbuf */
2240     for (i=0; i<ismax; i++) {
2241       row2proc_i = row2proc[i];
2242       PetscCall(PetscArrayzero(ctr,size));
2243       irow_i = irow[i];
2244       jmax   = nrow[i];
2245       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2246         proc = row2proc_i[j];
2247         if (proc != rank) { /* copy to the outgoing buf*/
2248           ctr[proc]++;
2249           *ptr[proc] = irow_i[j];
2250           ptr[proc]++;
2251         }
2252       }
2253       /* Update the headers for the current IS */
2254       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2255         if ((ctr_j = ctr[j])) {
2256           sbuf1_j        = sbuf1[j];
2257           k              = ++sbuf1_j[0];
2258           sbuf1_j[2*k]   = ctr_j;
2259           sbuf1_j[2*k-1] = i;
2260         }
2261       }
2262     }
2263 
2264     /*  Now  post the sends */
2265     PetscCall(PetscMalloc1(nrqs,&s_waits1));
2266     for (i=0; i<nrqs; ++i) {
2267       j    = pa[i];
2268       PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
2269     }
2270 
2271     /* Post Receives to capture the buffer size */
2272     PetscCall(PetscMalloc1(nrqs,&r_waits2));
2273     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
2274     if (nrqs) rbuf2[0] = tmp + msz;
2275     for (i=1; i<nrqs; ++i) {
2276       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2277     }
2278     for (i=0; i<nrqs; ++i) {
2279       j    = pa[i];
2280       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
2281     }
2282 
2283     /* Send to other procs the buf size they should allocate */
2284     /* Receive messages*/
2285     PetscCall(PetscMalloc1(nrqr,&s_waits2));
2286     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
2287     {
2288       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2289       PetscInt   *sbuf2_i;
2290 
2291       PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
2292       for (i=0; i<nrqr; ++i) {
2293         req_size[i] = 0;
2294         rbuf1_i        = rbuf1[i];
2295         start          = 2*rbuf1_i[0] + 1;
2296         end            = olengths1[i];
2297         PetscCall(PetscMalloc1(end,&sbuf2[i]));
2298         sbuf2_i        = sbuf2[i];
2299         for (j=start; j<end; j++) {
2300           id              = rbuf1_i[j] - rstart;
2301           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2302           sbuf2_i[j]      = ncols;
2303           req_size[i] += ncols;
2304         }
2305         req_source1[i] = onodes1[i];
2306         /* form the header */
2307         sbuf2_i[0] = req_size[i];
2308         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2309 
2310         PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
2311       }
2312     }
2313 
2314     PetscCall(PetscFree(onodes1));
2315     PetscCall(PetscFree(olengths1));
2316     PetscCall(PetscFree(r_waits1));
2317     PetscCall(PetscFree4(w1,w2,w3,w4));
2318 
2319     /* Receive messages*/
2320     PetscCall(PetscMalloc1(nrqs,&r_waits3));
2321     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
2322     for (i=0; i<nrqs; ++i) {
2323       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
2324       req_source2[i] = pa[i];
2325       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
2326     }
2327     PetscCall(PetscFree(r_waits2));
2328 
2329     /* Wait on sends1 and sends2 */
2330     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
2331     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
2332     PetscCall(PetscFree(s_waits1));
2333     PetscCall(PetscFree(s_waits2));
2334 
2335     /* Now allocate sending buffers for a->j, and send them off */
2336     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
2337     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2338     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
2339     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2340 
2341     PetscCall(PetscMalloc1(nrqr,&s_waits3));
2342     {
2343       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2344       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2345       PetscInt cend = C->cmap->rend;
2346       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2347 
2348       for (i=0; i<nrqr; i++) {
2349         rbuf1_i   = rbuf1[i];
2350         sbuf_aj_i = sbuf_aj[i];
2351         ct1       = 2*rbuf1_i[0] + 1;
2352         ct2       = 0;
2353         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2354           kmax = rbuf1[i][2*j];
2355           for (k=0; k<kmax; k++,ct1++) {
2356             row    = rbuf1_i[ct1] - rstart;
2357             nzA    = a_i[row+1] - a_i[row];
2358             nzB    = b_i[row+1] - b_i[row];
2359             ncols  = nzA + nzB;
2360             cworkA = a_j + a_i[row];
2361             cworkB = b_j + b_i[row];
2362 
2363             /* load the column indices for this row into cols */
2364             cols = sbuf_aj_i + ct2;
2365 
2366             lwrite = 0;
2367             for (l=0; l<nzB; l++) {
2368               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2369             }
2370             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2371             for (l=0; l<nzB; l++) {
2372               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2373             }
2374 
2375             ct2 += ncols;
2376           }
2377         }
2378         PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
2379       }
2380     }
2381 
2382     /* create col map: global col of C -> local col of submatrices */
2383     {
2384       const PetscInt *icol_i;
2385 #if defined(PETSC_USE_CTABLE)
2386       for (i=0; i<ismax; i++) {
2387         if (!allcolumns[i]) {
2388           PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));
2389 
2390           jmax   = ncol[i];
2391           icol_i = icol[i];
2392           cmap_i = cmap[i];
2393           for (j=0; j<jmax; j++) {
2394             PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
2395           }
2396         } else cmap[i] = NULL;
2397       }
2398 #else
2399       for (i=0; i<ismax; i++) {
2400         if (!allcolumns[i]) {
2401           PetscCall(PetscCalloc1(C->cmap->N,&cmap[i]));
2402           jmax   = ncol[i];
2403           icol_i = icol[i];
2404           cmap_i = cmap[i];
2405           for (j=0; j<jmax; j++) {
2406             cmap_i[icol_i[j]] = j+1;
2407           }
2408         } else cmap[i] = NULL;
2409       }
2410 #endif
2411     }
2412 
2413     /* Create lens which is required for MatCreate... */
2414     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2415     PetscCall(PetscMalloc1(ismax,&lens));
2416 
2417     if (ismax) {
2418       PetscCall(PetscCalloc1(j,&lens[0]));
2419     }
2420     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2421 
2422     /* Update lens from local data */
2423     for (i=0; i<ismax; i++) {
2424       row2proc_i = row2proc[i];
2425       jmax = nrow[i];
2426       if (!allcolumns[i]) cmap_i = cmap[i];
2427       irow_i = irow[i];
2428       lens_i = lens[i];
2429       for (j=0; j<jmax; j++) {
2430         row = irow_i[j];
2431         proc = row2proc_i[j];
2432         if (proc == rank) {
2433           PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2434           if (!allcolumns[i]) {
2435             for (k=0; k<ncols; k++) {
2436 #if defined(PETSC_USE_CTABLE)
2437               PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2438 #else
2439               tcol = cmap_i[cols[k]];
2440 #endif
2441               if (tcol) lens_i[j]++;
2442             }
2443           } else { /* allcolumns */
2444             lens_i[j] = ncols;
2445           }
2446           PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2447         }
2448       }
2449     }
2450 
2451     /* Create row map: global row of C -> local row of submatrices */
2452 #if defined(PETSC_USE_CTABLE)
2453     for (i=0; i<ismax; i++) {
2454       PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
2455       irow_i = irow[i];
2456       jmax   = nrow[i];
2457       for (j=0; j<jmax; j++) {
2458       PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
2459       }
2460     }
2461 #else
2462     for (i=0; i<ismax; i++) {
2463       PetscCall(PetscCalloc1(C->rmap->N,&rmap[i]));
2464       rmap_i = rmap[i];
2465       irow_i = irow[i];
2466       jmax   = nrow[i];
2467       for (j=0; j<jmax; j++) {
2468         rmap_i[irow_i[j]] = j;
2469       }
2470     }
2471 #endif
2472 
2473     /* Update lens from offproc data */
2474     {
2475       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2476 
2477       PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
2478       for (tmp2=0; tmp2<nrqs; tmp2++) {
2479         sbuf1_i = sbuf1[pa[tmp2]];
2480         jmax    = sbuf1_i[0];
2481         ct1     = 2*jmax+1;
2482         ct2     = 0;
2483         rbuf2_i = rbuf2[tmp2];
2484         rbuf3_i = rbuf3[tmp2];
2485         for (j=1; j<=jmax; j++) {
2486           is_no  = sbuf1_i[2*j-1];
2487           max1   = sbuf1_i[2*j];
2488           lens_i = lens[is_no];
2489           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2490           rmap_i = rmap[is_no];
2491           for (k=0; k<max1; k++,ct1++) {
2492 #if defined(PETSC_USE_CTABLE)
2493             PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
2494             row--;
2495             PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2496 #else
2497             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2498 #endif
2499             max2 = rbuf2_i[ct1];
2500             for (l=0; l<max2; l++,ct2++) {
2501               if (!allcolumns[is_no]) {
2502 #if defined(PETSC_USE_CTABLE)
2503                 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2504 #else
2505                 tcol = cmap_i[rbuf3_i[ct2]];
2506 #endif
2507                 if (tcol) lens_i[row]++;
2508               } else { /* allcolumns */
2509                 lens_i[row]++; /* lens_i[row] += max2 ? */
2510               }
2511             }
2512           }
2513         }
2514       }
2515     }
2516     PetscCall(PetscFree(r_waits3));
2517     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
2518     PetscCall(PetscFree(s_waits3));
2519 
2520     /* Create the submatrices */
2521     for (i=0; i<ismax; i++) {
2522       PetscInt    rbs,cbs;
2523 
2524       PetscCall(ISGetBlockSize(isrow[i],&rbs));
2525       PetscCall(ISGetBlockSize(iscol[i],&cbs));
2526 
2527       PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
2528       PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));
2529 
2530       PetscCall(MatSetBlockSizes(submats[i],rbs,cbs));
2531       PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
2532       PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));
2533 
2534       /* create struct Mat_SubSppt and attached it to submat */
2535       PetscCall(PetscNew(&smat_i));
2536       subc = (Mat_SeqAIJ*)submats[i]->data;
2537       subc->submatis1 = smat_i;
2538 
2539       smat_i->destroy          = submats[i]->ops->destroy;
2540       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2541       submats[i]->factortype   = C->factortype;
2542 
2543       smat_i->id          = i;
2544       smat_i->nrqs        = nrqs;
2545       smat_i->nrqr        = nrqr;
2546       smat_i->rbuf1       = rbuf1;
2547       smat_i->rbuf2       = rbuf2;
2548       smat_i->rbuf3       = rbuf3;
2549       smat_i->sbuf2       = sbuf2;
2550       smat_i->req_source2 = req_source2;
2551 
2552       smat_i->sbuf1       = sbuf1;
2553       smat_i->ptr         = ptr;
2554       smat_i->tmp         = tmp;
2555       smat_i->ctr         = ctr;
2556 
2557       smat_i->pa           = pa;
2558       smat_i->req_size     = req_size;
2559       smat_i->req_source1  = req_source1;
2560 
2561       smat_i->allcolumns  = allcolumns[i];
2562       smat_i->singleis    = PETSC_FALSE;
2563       smat_i->row2proc    = row2proc[i];
2564       smat_i->rmap        = rmap[i];
2565       smat_i->cmap        = cmap[i];
2566     }
2567 
2568     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2569       PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
2570       PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
2571       PetscCall(MatSetType(submats[0],MATDUMMY));
2572 
2573       /* create struct Mat_SubSppt and attached it to submat */
2574       PetscCall(PetscNewLog(submats[0],&smat_i));
2575       submats[0]->data = (void*)smat_i;
2576 
2577       smat_i->destroy          = submats[0]->ops->destroy;
2578       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2579       submats[0]->factortype   = C->factortype;
2580 
2581       smat_i->id          = 0;
2582       smat_i->nrqs        = nrqs;
2583       smat_i->nrqr        = nrqr;
2584       smat_i->rbuf1       = rbuf1;
2585       smat_i->rbuf2       = rbuf2;
2586       smat_i->rbuf3       = rbuf3;
2587       smat_i->sbuf2       = sbuf2;
2588       smat_i->req_source2 = req_source2;
2589 
2590       smat_i->sbuf1       = sbuf1;
2591       smat_i->ptr         = ptr;
2592       smat_i->tmp         = tmp;
2593       smat_i->ctr         = ctr;
2594 
2595       smat_i->pa           = pa;
2596       smat_i->req_size     = req_size;
2597       smat_i->req_source1  = req_source1;
2598 
2599       smat_i->allcolumns  = PETSC_FALSE;
2600       smat_i->singleis    = PETSC_FALSE;
2601       smat_i->row2proc    = NULL;
2602       smat_i->rmap        = NULL;
2603       smat_i->cmap        = NULL;
2604     }
2605 
2606     if (ismax) PetscCall(PetscFree(lens[0]));
2607     PetscCall(PetscFree(lens));
2608     if (sbuf_aj) {
2609       PetscCall(PetscFree(sbuf_aj[0]));
2610       PetscCall(PetscFree(sbuf_aj));
2611     }
2612 
2613   } /* endof scall == MAT_INITIAL_MATRIX */
2614 
2615   /* Post recv matrix values */
2616   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
2617   PetscCall(PetscMalloc1(nrqs,&rbuf4));
2618   PetscCall(PetscMalloc1(nrqs,&r_waits4));
2619   for (i=0; i<nrqs; ++i) {
2620     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
2621     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
2622   }
2623 
2624   /* Allocate sending buffers for a->a, and send them off */
2625   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
2626   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2627   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
2628   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2629 
2630   PetscCall(PetscMalloc1(nrqr,&s_waits4));
2631   {
2632     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2633     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2634     PetscInt    cend   = C->cmap->rend;
2635     PetscInt    *b_j   = b->j;
2636     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2637 
2638     PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
2639     PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
2640     for (i=0; i<nrqr; i++) {
2641       rbuf1_i   = rbuf1[i];
2642       sbuf_aa_i = sbuf_aa[i];
2643       ct1       = 2*rbuf1_i[0]+1;
2644       ct2       = 0;
2645       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2646         kmax = rbuf1_i[2*j];
2647         for (k=0; k<kmax; k++,ct1++) {
2648           row    = rbuf1_i[ct1] - rstart;
2649           nzA    = a_i[row+1] - a_i[row];
2650           nzB    = b_i[row+1] - b_i[row];
2651           ncols  = nzA + nzB;
2652           cworkB = b_j + b_i[row];
2653           vworkA = a_a + a_i[row];
2654           vworkB = b_a + b_i[row];
2655 
2656           /* load the column values for this row into vals*/
2657           vals = sbuf_aa_i+ct2;
2658 
2659           lwrite = 0;
2660           for (l=0; l<nzB; l++) {
2661             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2662           }
2663           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2664           for (l=0; l<nzB; l++) {
2665             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2666           }
2667 
2668           ct2 += ncols;
2669         }
2670       }
2671       PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
2672     }
2673     PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
2674     PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
2675   }
2676 
2677   /* Assemble the matrices */
2678   /* First assemble the local rows */
2679   for (i=0; i<ismax; i++) {
2680     row2proc_i = row2proc[i];
2681     subc      = (Mat_SeqAIJ*)submats[i]->data;
2682     imat_ilen = subc->ilen;
2683     imat_j    = subc->j;
2684     imat_i    = subc->i;
2685     imat_a    = subc->a;
2686 
2687     if (!allcolumns[i]) cmap_i = cmap[i];
2688     rmap_i = rmap[i];
2689     irow_i = irow[i];
2690     jmax   = nrow[i];
2691     for (j=0; j<jmax; j++) {
2692       row  = irow_i[j];
2693       proc = row2proc_i[j];
2694       if (proc == rank) {
2695         old_row = row;
2696 #if defined(PETSC_USE_CTABLE)
2697         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2698         row--;
2699 #else
2700         row = rmap_i[row];
2701 #endif
2702         ilen_row = imat_ilen[row];
2703         PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2704         mat_i    = imat_i[row];
2705         mat_a    = imat_a + mat_i;
2706         mat_j    = imat_j + mat_i;
2707         if (!allcolumns[i]) {
2708           for (k=0; k<ncols; k++) {
2709 #if defined(PETSC_USE_CTABLE)
2710             PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2711 #else
2712             tcol = cmap_i[cols[k]];
2713 #endif
2714             if (tcol) {
2715               *mat_j++ = tcol - 1;
2716               *mat_a++ = vals[k];
2717               ilen_row++;
2718             }
2719           }
2720         } else { /* allcolumns */
2721           for (k=0; k<ncols; k++) {
2722             *mat_j++ = cols[k];  /* global col index! */
2723             *mat_a++ = vals[k];
2724             ilen_row++;
2725           }
2726         }
2727         PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2728 
2729         imat_ilen[row] = ilen_row;
2730       }
2731     }
2732   }
2733 
2734   /* Now assemble the off proc rows */
2735   PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
2736   for (tmp2=0; tmp2<nrqs; tmp2++) {
2737     sbuf1_i = sbuf1[pa[tmp2]];
2738     jmax    = sbuf1_i[0];
2739     ct1     = 2*jmax + 1;
2740     ct2     = 0;
2741     rbuf2_i = rbuf2[tmp2];
2742     rbuf3_i = rbuf3[tmp2];
2743     rbuf4_i = rbuf4[tmp2];
2744     for (j=1; j<=jmax; j++) {
2745       is_no     = sbuf1_i[2*j-1];
2746       rmap_i    = rmap[is_no];
2747       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2748       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2749       imat_ilen = subc->ilen;
2750       imat_j    = subc->j;
2751       imat_i    = subc->i;
2752       imat_a    = subc->a;
2753       max1      = sbuf1_i[2*j];
2754       for (k=0; k<max1; k++,ct1++) {
2755         row = sbuf1_i[ct1];
2756 #if defined(PETSC_USE_CTABLE)
2757         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2758         row--;
2759 #else
2760         row = rmap_i[row];
2761 #endif
2762         ilen  = imat_ilen[row];
2763         mat_i = imat_i[row];
2764         mat_a = imat_a + mat_i;
2765         mat_j = imat_j + mat_i;
2766         max2  = rbuf2_i[ct1];
2767         if (!allcolumns[is_no]) {
2768           for (l=0; l<max2; l++,ct2++) {
2769 #if defined(PETSC_USE_CTABLE)
2770             PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2771 #else
2772             tcol = cmap_i[rbuf3_i[ct2]];
2773 #endif
2774             if (tcol) {
2775               *mat_j++ = tcol - 1;
2776               *mat_a++ = rbuf4_i[ct2];
2777               ilen++;
2778             }
2779           }
2780         } else { /* allcolumns */
2781           for (l=0; l<max2; l++,ct2++) {
2782             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2783             *mat_a++ = rbuf4_i[ct2];
2784             ilen++;
2785           }
2786         }
2787         imat_ilen[row] = ilen;
2788       }
2789     }
2790   }
2791 
2792   if (!iscsorted) { /* sort column indices of the rows */
2793     for (i=0; i<ismax; i++) {
2794       subc      = (Mat_SeqAIJ*)submats[i]->data;
2795       imat_j    = subc->j;
2796       imat_i    = subc->i;
2797       imat_a    = subc->a;
2798       imat_ilen = subc->ilen;
2799 
2800       if (allcolumns[i]) continue;
2801       jmax = nrow[i];
2802       for (j=0; j<jmax; j++) {
2803         mat_i = imat_i[j];
2804         mat_a = imat_a + mat_i;
2805         mat_j = imat_j + mat_i;
2806         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
2807       }
2808     }
2809   }
2810 
2811   PetscCall(PetscFree(r_waits4));
2812   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
2813   PetscCall(PetscFree(s_waits4));
2814 
2815   /* Restore the indices */
2816   for (i=0; i<ismax; i++) {
2817     PetscCall(ISRestoreIndices(isrow[i],irow+i));
2818     if (!allcolumns[i]) {
2819       PetscCall(ISRestoreIndices(iscol[i],icol+i));
2820     }
2821   }
2822 
2823   for (i=0; i<ismax; i++) {
2824     PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
2825     PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
2826   }
2827 
2828   /* Destroy allocated memory */
2829   if (sbuf_aa) {
2830     PetscCall(PetscFree(sbuf_aa[0]));
2831     PetscCall(PetscFree(sbuf_aa));
2832   }
2833   PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));
2834 
2835   for (i=0; i<nrqs; ++i) {
2836     PetscCall(PetscFree(rbuf4[i]));
2837   }
2838   PetscCall(PetscFree(rbuf4));
2839 
2840   PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns));
2841   PetscFunctionReturn(0);
2842 }
2843 
2844 /*
2845  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2846  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2848  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2849  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2850  state, and needs to be "assembled" later by compressing B's column space.
2851 
2852  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2853  Following this call, C->A & C->B have been created, even if empty.
2854  */
2855 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2856 {
2857   /* If making this function public, change the error returned in this function away from _PLIB. */
2858   Mat_MPIAIJ     *aij;
2859   Mat_SeqAIJ     *Baij;
2860   PetscBool      seqaij,Bdisassembled;
2861   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2862   PetscScalar    v;
2863   const PetscInt *rowindices,*colindices;
2864 
2865   PetscFunctionBegin;
2866   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2867   if (A) {
2868     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij));
2869     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2870     if (rowemb) {
2871       PetscCall(ISGetLocalSize(rowemb,&m));
2872       PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2873     } else {
2874       PetscCheck(C->rmap->n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2875     }
2876     if (dcolemb) {
2877       PetscCall(ISGetLocalSize(dcolemb,&n));
2878       PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n);
2879     } else {
2880       PetscCheck(C->cmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2881     }
2882   }
2883   if (B) {
2884     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij));
2885     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2886     if (rowemb) {
2887       PetscCall(ISGetLocalSize(rowemb,&m));
2888       PetscCheck(m == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2889     } else {
2890       PetscCheck(C->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2891     }
2892     if (ocolemb) {
2893       PetscCall(ISGetLocalSize(ocolemb,&n));
2894       PetscCheck(n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n);
2895     } else {
2896       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2897     }
2898   }
2899 
2900   aij = (Mat_MPIAIJ*)C->data;
2901   if (!aij->A) {
2902     /* Mimic parts of MatMPIAIJSetPreallocation() */
2903     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A));
2904     PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n));
2905     PetscCall(MatSetBlockSizesFromMats(aij->A,C,C));
2906     PetscCall(MatSetType(aij->A,MATSEQAIJ));
2907     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A));
2908   }
2909   if (A) {
2910     PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A));
2911   } else {
2912     PetscCall(MatSetUp(aij->A));
2913   }
2914   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2915     /*
2916       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2917       need to "disassemble" B -- convert it to using C's global indices.
2918       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2919 
2920       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2921       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2922 
2923       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2924       At least avoid calling MatSetValues() and the implied searches?
2925     */
2926 
2927     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2928 #if defined(PETSC_USE_CTABLE)
2929       PetscCall(PetscTableDestroy(&aij->colmap));
2930 #else
2931       PetscCall(PetscFree(aij->colmap));
2932       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2933       if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt)));
2934 #endif
2935       ngcol = 0;
2936       if (aij->lvec) {
2937         PetscCall(VecGetSize(aij->lvec,&ngcol));
2938       }
2939       if (aij->garray) {
2940         PetscCall(PetscFree(aij->garray));
2941         PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt)));
2942       }
2943       PetscCall(VecDestroy(&aij->lvec));
2944       PetscCall(VecScatterDestroy(&aij->Mvctx));
2945     }
2946     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2947       PetscCall(MatDestroy(&aij->B));
2948     }
2949     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2950       PetscCall(MatZeroEntries(aij->B));
2951     }
2952   }
2953   Bdisassembled = PETSC_FALSE;
2954   if (!aij->B) {
2955     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B));
2956     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B));
2957     PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N));
2958     PetscCall(MatSetBlockSizesFromMats(aij->B,B,B));
2959     PetscCall(MatSetType(aij->B,MATSEQAIJ));
2960     Bdisassembled = PETSC_TRUE;
2961   }
2962   if (B) {
2963     Baij = (Mat_SeqAIJ*)B->data;
2964     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2965       PetscCall(PetscMalloc1(B->rmap->n,&nz));
2966       for (i=0; i<B->rmap->n; i++) {
2967         nz[i] = Baij->i[i+1] - Baij->i[i];
2968       }
2969       PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz));
2970       PetscCall(PetscFree(nz));
2971     }
2972 
2973     PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend));
2974     shift = rend-rstart;
2975     count = 0;
2976     rowindices = NULL;
2977     colindices = NULL;
2978     if (rowemb) {
2979       PetscCall(ISGetIndices(rowemb,&rowindices));
2980     }
2981     if (ocolemb) {
2982       PetscCall(ISGetIndices(ocolemb,&colindices));
2983     }
2984     for (i=0; i<B->rmap->n; i++) {
2985       PetscInt row;
2986       row = i;
2987       if (rowindices) row = rowindices[i];
2988       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2989         col  = Baij->j[count];
2990         if (colindices) col = colindices[col];
2991         if (Bdisassembled && col>=rstart) col += shift;
2992         v    = Baij->a[count];
2993         PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES));
2994         ++count;
2995       }
2996     }
2997     /* No assembly for aij->B is necessary. */
2998     /* FIXME: set aij->B's nonzerostate correctly. */
2999   } else {
3000     PetscCall(MatSetUp(aij->B));
3001   }
3002   C->preallocated  = PETSC_TRUE;
3003   C->was_assembled = PETSC_FALSE;
3004   C->assembled     = PETSC_FALSE;
3005    /*
3006       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3007       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3008    */
3009   PetscFunctionReturn(0);
3010 }
3011 
3012 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3014  */
3015 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3016 {
3017   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3018 
3019   PetscFunctionBegin;
3020   PetscValidPointer(A,2);
3021   PetscValidPointer(B,3);
3022   /* FIXME: make sure C is assembled */
3023   *A = aij->A;
3024   *B = aij->B;
3025   /* Note that we don't incref *A and *B, so be careful! */
3026   PetscFunctionReturn(0);
3027 }
3028 
3029 /*
3030   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3031   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3032 */
3033 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3034                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3035                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3036                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3037                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3038 {
3039   PetscMPIInt    size,flag;
3040   PetscInt       i,ii,cismax,ispar;
3041   Mat            *A,*B;
3042   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3043 
3044   PetscFunctionBegin;
3045   if (!ismax) PetscFunctionReturn(0);
3046 
3047   for (i = 0, cismax = 0; i < ismax; ++i) {
3048     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
3049     PetscCheck(flag == MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3050     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3051     if (size > 1) ++cismax;
3052   }
3053 
3054   /*
3055      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3056      ispar counts the number of parallel ISs across C's comm.
3057   */
3058   PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
3059   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3060     PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
3061     PetscFunctionReturn(0);
3062   }
3063 
3064   /* if (ispar) */
3065   /*
3066     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3067     These are used to extract the off-diag portion of the resulting parallel matrix.
3068     The row IS for the off-diag portion is the same as for the diag portion,
3069     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3070   */
3071   PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
3072   PetscCall(PetscMalloc1(cismax,&ciscol_p));
3073   for (i = 0, ii = 0; i < ismax; ++i) {
3074     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3075     if (size > 1) {
3076       /*
3077          TODO: This is the part that's ***NOT SCALABLE***.
3078          To fix this we need to extract just the indices of C's nonzero columns
3079          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3080          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3081          to be done without serializing on the IS list, so, most likely, it is best
3082          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3083       */
3084       PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
3085       /* Now we have to
3086          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3087              were sorted on each rank, concatenated they might no longer be sorted;
3088          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3089              indices in the nondecreasing order to the original index positions.
3090          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3091       */
3092       PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
3093       PetscCall(ISSort(ciscol[ii]));
3094       ++ii;
3095     }
3096   }
3097   PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
3098   for (i = 0, ii = 0; i < ismax; ++i) {
3099     PetscInt       j,issize;
3100     const PetscInt *indices;
3101 
3102     /*
3103        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3104      */
3105     PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
3106     PetscCall(ISSort(isrow[i]));
3107     PetscCall(ISGetLocalSize(isrow[i],&issize));
3108     PetscCall(ISGetIndices(isrow[i],&indices));
3109     for (j = 1; j < issize; ++j) {
3110       PetscCheck(indices[j] != indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3111     }
3112     PetscCall(ISRestoreIndices(isrow[i],&indices));
3113     PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
3114     PetscCall(ISSort(iscol[i]));
3115     PetscCall(ISGetLocalSize(iscol[i],&issize));
3116     PetscCall(ISGetIndices(iscol[i],&indices));
3117     for (j = 1; j < issize; ++j) {
3118       PetscCheck(indices[j-1] != indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3119     }
3120     PetscCall(ISRestoreIndices(iscol[i],&indices));
3121     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3122     if (size > 1) {
3123       cisrow[ii] = isrow[i];
3124       ++ii;
3125     }
3126   }
3127   /*
3128     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3129     array of sequential matrices underlying the resulting parallel matrices.
3130     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3131     contain duplicates.
3132 
3133     There are as many diag matrices as there are original index sets. There are only as many parallel
3134     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3135 
3136     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3137     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3138       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3139       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3140       with A[i] and B[ii] extracted from the corresponding MPI submat.
3141     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3142       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3143       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3144       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3145       values into A[i] and B[ii] sitting inside the corresponding submat.
3146     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3147       A[i], B[ii], AA[i] or BB[ii] matrices.
3148   */
3149   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3150   if (scall == MAT_INITIAL_MATRIX) {
3151     PetscCall(PetscMalloc1(ismax,submat));
3152   }
3153 
3154   /* Now obtain the sequential A and B submatrices separately. */
3155   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3156   PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
3157   PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));
3158 
3159   /*
3160     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3161     matrices A & B have been extracted directly into the parallel matrices containing them, or
3162     simply into the sequential matrix identical with the corresponding A (if size == 1).
3163     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3164     to have the same sparsity pattern.
3165     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3166     must be constructed for C. This is done by setseqmat(s).
3167   */
3168   for (i = 0, ii = 0; i < ismax; ++i) {
3169     /*
3170        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3171        That way we can avoid sorting and computing permutations when reusing.
3172        To this end:
3173         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3174         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3175           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3176     */
3177     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3178 
3179     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3180     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3181     if (size > 1) {
3182       if (scall == MAT_INITIAL_MATRIX) {
3183         PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
3184         PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
3185         PetscCall(MatSetType((*submat)[i],MATMPIAIJ));
3186         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3187         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3188       }
3189       /*
3190         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3191       */
3192       {
3193         Mat AA = A[i],BB = B[ii];
3194 
3195         if (AA || BB) {
3196           PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
3197           PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
3198           PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
3199         }
3200         PetscCall(MatDestroy(&AA));
3201       }
3202       PetscCall(ISDestroy(ciscol+ii));
3203       PetscCall(ISDestroy(ciscol_p+ii));
3204       ++ii;
3205     } else { /* if (size == 1) */
3206       if (scall == MAT_REUSE_MATRIX) {
3207         PetscCall(MatDestroy(&(*submat)[i]));
3208       }
3209       if (isrow_p[i] || iscol_p[i]) {
3210         PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
3211         PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
3212         /* Otherwise A is extracted straight into (*submat)[i]. */
3213         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3214         PetscCall(MatDestroy(A+i));
3215       } else (*submat)[i] = A[i];
3216     }
3217     PetscCall(ISDestroy(&isrow_p[i]));
3218     PetscCall(ISDestroy(&iscol_p[i]));
3219   }
3220   PetscCall(PetscFree2(cisrow,ciscol));
3221   PetscCall(PetscFree2(isrow_p,iscol_p));
3222   PetscCall(PetscFree(ciscol_p));
3223   PetscCall(PetscFree(A));
3224   PetscCall(MatDestroySubMatrices(cismax,&B));
3225   PetscFunctionReturn(0);
3226 }
3227 
3228 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3229 {
3230   PetscFunctionBegin;
3231   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
3232   PetscFunctionReturn(0);
3233 }
3234