xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 0e02354e4efb6ae7920bb0e7d191735ccbbd1e1e)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscInt       i;
24 
25   PetscFunctionBegin;
26   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27   for (i=0; i<ov; ++i) {
28     PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
29   }
30   PetscFunctionReturn(0);
31 }
32 
33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
34 {
35   PetscInt       i;
36 
37   PetscFunctionBegin;
38   PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
39   for (i=0; i<ov; ++i) {
40     PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
41   }
42   PetscFunctionReturn(0);
43 }
44 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One round of overlap growth.

   Phase 1: each rank scans its index sets, collects rows owned by other
   ranks, and ships <is id, row> pairs to the owners through a PetscSF.
   Phase 2: owners expand those rows into overlap data
   (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and a second SF carries the
   results back, overlapped with the purely local expansion
   (MatIncreaseOverlap_MPIAIJ_Local_Scalable).  Finally the returned data
   is merged into the index sets (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors; tlength bounds the number of remote rows */
  PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
    PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    /* only the local expansion is needed */
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two entries (<is id, row>) per remote row */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  /* build the SF leaves: one entry per incoming <is id, row> value,
     indexed by the sender's offset so the bcast lands contiguously */
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
  PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  PetscCall(PetscFree2(todata,fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second two-sided exchange: reply sizes, then the reply data via a second SF */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  PetscCall(PetscCalloc1(nrecvrows,&todata));
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata,sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}
187 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap data received
   from remote owners back into the index sets.

   recvdata is a stream of triples <is id, data size, data...> of total
   length nrecvs.  Each IS is rebuilt as the sorted, duplicate-free union
   of its current indices and the received indices.  The old IS objects are
   destroyed and recreated on duplicates of their original communicators.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of max_lsize+nrecvs entries: existing indices plus,
     at worst, all received values belong to this one IS */
  PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    /* keep the communicator alive since is[i] is destroyed before recreation */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    PetscCall(ISGetIndices(is[i],&indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i],&indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i));
    PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}
235 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Owner-side expansion of remote
   overlap requests.

   fromrows contains, per requesting rank i, fromsizes[2*i]/2 pairs
   <is id, row> for rows this rank owns.  For every (rank, is) pair the rows
   are expanded into the column indices of the local diagonal (A) and
   off-diagonal (B) blocks, sorted and deduplicated, and packed into
   sbdata as triples <is id, count, indices...>.  sbrowsizes receives per-rank
   <size, offset> pairs ready for the reply PetscCommBuildTwoSided.

   The caller frees *sbrowsizes and *sbrows (allocated here as a pair via
   PetscCalloc2, freed with PetscFree2(sbdata,sbsizes) in the caller).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the size of each reply so sbdata can be sized exactly */
  for (i=0; i<nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row    = rows_i[l]-rstart; /* local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;     /* shift diagonal-block columns to global numbering */
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];       /* off-diagonal columns are already global via garray */
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: repeat the expansion, this time filling sbdata; use this code again */
  for (i=0;i<nfrom;i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i]  += indvc_ij;
      PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs expected by the reply exchange */
  PetscCall(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
357 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Local part of one overlap round.

   For each IS, every locally-owned row is expanded into the global column
   indices of its nonzeros in the diagonal (A) and off-diagonal (B) blocks;
   rows owned by other ranks are skipped here (they are handled by the
   send/receive pair in the scalable Once routine).  Each IS is replaced by
   the sorted, duplicate-free result on a duplicate of its communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total local nonzeros bounds how many column indices any IS can expand to */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i],&lsize));
    PetscCall(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue; /* remote rows are expanded by their owners */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;     /* shift diagonal-block column to global numbering */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];       /* off-diagonal columns are already global via garray */
        indices_temp[lsize_tmp++] = col;
      }
    }
   PetscCall(ISRestoreIndices(is[i],&indices));
   /* keep the communicator alive across the destroy/recreate of is[i] */
   PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
   PetscCall(ISDestroy(&is[i]));
   PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
   PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
   PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
  mesg [5] ... mesg [n]   = data(is[1])
  -----------
  mesg [n+1] ... mesg [m] = data(is[5])
  -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
446   nrqr - no of requests received (which have to be or which have been processed)
447 */
/*
   MatIncreaseOverlap_MPIAIJ_Once - One round of overlap growth (non-scalable
   algorithm, see the sample message format above).

   Round 1: each rank counts what it must ask of every other rank (w1-w4),
   builds request messages of the form <no of IS, (is id, count)*, rows...>,
   and exchanges them (tag1).  Rows owned locally are expanded right away by
   MatIncreaseOverlap_MPIAIJ_Local.  Round 2: each rank answers the requests
   it received (MatIncreaseOverlap_MPIAIJ_Receive) and the answers travel
   back (tag2), after which the merged results are turned into new IS's.

   Membership is tracked per IS with a PetscBT bitmap (table); the actual
   indices are stored either in dense arrays (data) or, with
   PETSC_USE_CTABLE, in hash tables (table_data) to avoid O(M) storage.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N; /* global number of rows */

  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));

  PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    PetscCall(ISGetIndices(is[i],&idx[i]));
    PetscCall(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = total rows destined for rank p, w3[p] = number of IS's contributing */
  PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  for (i=0; i<imax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and the from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(outdat,size));
  PetscCall(PetscArrayzero(ptr,size));

  /* carve the single buffer tmp into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      PetscCall(PetscTableCreate(n[i],M,&table_data[i]));
    }
    PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M,imax, &Mimax));
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      PetscCall(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) { /* first time this row is seen locally */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES)); /* keys/values shifted by 1: PetscTable cannot store 0 */
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    PetscCall(ISRestoreIndices(is[i],idx+i));
    /* keep the communicators alive: is[i] is destroyed below and recreated at the end */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  PetscCall(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    PetscCall(ISDestroy(&is[i]));
  }

  /* Do Local work while the round-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr,&recv_status));
  PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
  PetscCall(PetscFree4(w1,w2,w3,w4));

  /* expand the received requests into reply buffers xdata of lengths isz1 */
  PetscCall(PetscMalloc1(nrqr,&xdata));
  PetscCall(PetscMalloc1(nrqr,&isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, and the from-ids */
    PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now  post the sends */
  PetscCall(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message: same header layout as the requests,
         <no of IS, (is id, count)*, rows...> */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* index of the first data entry past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max+1,&tdata));
#endif

  /* build the enlarged index sets on the saved communicators */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k; /* undo the +1 shift used when the entries were inserted */
    }
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    PetscCall(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table,data,isz,t_p));
#else
  PetscCall(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}
792 
793 /*
794    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
795        the work on the local processor.
796 
797      Inputs:
798       C      - MAT_MPIAIJ;
799       imax - total no of index sets processed at a time;
800       table  - an array of char - size = m bits.
801 
802      Output:
803       isz    - array containing the count of the solution elements corresponding
804                to each index set;
805       data or table_data  - pointer to the solutions
806 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;  /* A: diagonal block, B: off-diagonal block of the parallel matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;             /* bit table marking global columns already present in set i */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;       /* first global row owned by this process */
  cstart = C->cmap->rstart;       /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;             /* maps compressed B column indices to global column indices */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    PetscCheck(tcount == isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    PetscCall(PetscMalloc1(tcount,&tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      /* the table stores (index+1,slot+1); decrement both to recover 0-based values */
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    /* only the rows already in the set at entry are expanded in this pass;
       entries appended below (isz_i++) are not re-scanned */
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* local diagonal-block column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* compressed off-diagonal column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;               /* record the enlarged size of index set i */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
894 
895 /*
896       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
897          and return the output
898 
899          Input:
900            C    - the matrix
901            nrqr - no of messages being processed.
902            rbuf - an array of pointers to the received requests
903 
904          Output:
905            xdata - array of messages to be sent back
906            isz1  - size of each message
907 
908   For better efficiency perhaps we should malloc separately each xdata[i],
909 then if a remalloc is required we need only copy the data for that one row
910 rather then all previous rows as it is now where a single large chunk of
911 memory is used.
912 
913 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;  /* diagonal and off-diagonal blocks */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;              /* bit table to avoid emitting duplicate columns per IS */

  PetscFunctionBegin;
  m      = C->rmap->N;                /* global number of rows; sizes the bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;                 /* compressed B column -> global column */

  /* ct: total number of requested rows over all messages; total_sz: total of the per-IS counts */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic size for the single reply buffer: 3x the larger of the request size
     and (rows requested)*(average nonzeros per local row); grown below if exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    /* all xdata[i] alias into this one chunk; xdata[i+1] is set from xdata[i] below */
    PetscCall(PetscMalloc1(mem_estimate,&xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m,&xtable));
  PetscCall(PetscArrayzero(isz1,nrqr));

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];               /* number of index sets in this message */
    ct1    =  2*rbuf_0+1;              /* read cursor: skip the header */
    ct2    =  ct1;                     /* write cursor into xdata[i] */
    ct3   += ct1;                      /* running total written across all messages */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m,xtable));
      oct2 = ct2;                      /* remember where this IS's output starts */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          /* grow the shared chunk by 1.5x when full, then rebase all aliasing pointers */
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            PetscCall(PetscMalloc1(new_estimate,&tmp));
            PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every row emitted above by the columns of its local row of C */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {    /* diagonal block: local column -> global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {    /* off-diagonal block: compressed column -> global */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;  /* next message's region follows this one */
    isz1[i]     = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
  PetscFunctionReturn(0);
}
1030 /* -------------------------------------------------------------------------*/
1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1032 /*
1033     Every processor gets the entire matrix
1034 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N,&lens));
    /* fill in lengths for locally owned rows only; the allgather below completes the rest */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* MPI_IN_PLACE: each rank contributes its own slice of lens[] directly */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF,&B));
    PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B,A,A));
    PetscCall(MatSetType(B,((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B,0,lens));
    /* two slots: Bin[0] holds the matrix, Bin[1] is left NULL */
    PetscCall(PetscCalloc1(2,Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at the offset for this rank's rows */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge B (off-diagonal, below cstart), then A (diagonal), then B (above),
       producing each row's global column indices in ascending order */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by rank i */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: structure already built; reuse the previously created matrix */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B,&bda));
    sendcount = ad->nz + bd->nz;
    /* values are packed in the same lower/diagonal/upper order as the indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* on MAT_REUSE_MATRIX these were not allocated above */
    if (!recvcounts) {
      PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda));
  }  /* end of (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts,displs));

  /* propagate symmetry information from the parallel matrix to the gathered copy */
  if (A->symmetric) PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
  else if (A->hermitian) PetscCall(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE));
  else if (A->structurally_symmetric) PetscCall(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE));
  PetscFunctionReturn(0);
}
1202 
1203 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1204 {
1205   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1206   Mat            submat,A = c->A,B = c->B;
1207   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1208   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1209   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1210   const PetscInt *icol,*irow;
1211   PetscInt       nrow,ncol,start;
1212   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1213   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1214   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1215   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1216   PetscInt       *lens,rmax,ncols,*cols,Crow;
1217 #if defined(PETSC_USE_CTABLE)
1218   PetscTable     cmap,rmap;
1219   PetscInt       *cmap_loc,*rmap_loc;
1220 #else
1221   PetscInt       *cmap,*rmap;
1222 #endif
1223   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1224   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1225   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1226   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1227   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1228   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1229   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1230   MPI_Comm       comm;
1231   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1232   PetscMPIInt    *onodes1,*olengths1,idex,end;
1233   Mat_SubSppt    *smatis1;
1234   PetscBool      isrowsorted,iscolsorted;
1235 
1236   PetscFunctionBegin;
1237   PetscValidLogicalCollectiveInt(C,ismax,2);
1238   PetscValidLogicalCollectiveEnum(C,scall,5);
1239   PetscCheck(ismax == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1240   PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
1241   PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a));
1242   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
1243   size = c->size;
1244   rank = c->rank;
1245 
1246   PetscCall(ISSorted(iscol[0],&iscolsorted));
1247   PetscCall(ISSorted(isrow[0],&isrowsorted));
1248   PetscCall(ISGetIndices(isrow[0],&irow));
1249   PetscCall(ISGetLocalSize(isrow[0],&nrow));
1250   if (allcolumns) {
1251     icol = NULL;
1252     ncol = C->cmap->N;
1253   } else {
1254     PetscCall(ISGetIndices(iscol[0],&icol));
1255     PetscCall(ISGetLocalSize(iscol[0],&ncol));
1256   }
1257 
1258   if (scall == MAT_INITIAL_MATRIX) {
1259     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1260 
1261     /* Get some new tags to keep the communication clean */
1262     tag1 = ((PetscObject)C)->tag;
1263     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
1264     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
1265 
1266     /* evaluate communication - mesg to who, length of mesg, and buffer space
1267      required. Based on this, buffers are allocated, and data copied into them */
1268     PetscCall(PetscCalloc2(size,&w1,size,&w2));
1269     PetscCall(PetscMalloc1(nrow,&row2proc));
1270 
1271     /* w1[proc] = num of rows owned by proc -- to be requested */
1272     proc = 0;
1273     nrqs = 0; /* num of outgoing messages */
1274     for (j=0; j<nrow; j++) {
1275       row  = irow[j];
1276       if (!isrowsorted) proc = 0;
1277       while (row >= C->rmap->range[proc+1]) proc++;
1278       w1[proc]++;
1279       row2proc[j] = proc; /* map row index to proc */
1280 
1281       if (proc != rank && !w2[proc]) {
1282         w2[proc] = 1; nrqs++;
1283       }
1284     }
1285     w1[rank] = 0;  /* rows owned by self will not be requested */
1286 
1287     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
1288     for (proc=0,j=0; proc<size; proc++) {
1289       if (w1[proc]) { pa[j++] = proc;}
1290     }
1291 
1292     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1293     msz = 0;              /* total mesg length (for all procs) */
1294     for (i=0; i<nrqs; i++) {
1295       proc      = pa[i];
1296       w1[proc] += 3;
1297       msz      += w1[proc];
1298     }
1299     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
1300 
1301     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1302     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1303     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
1304 
1305     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1306        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1307     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
1308 
1309     /* Now post the Irecvs corresponding to these messages */
1310     PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
1311 
1312     PetscCall(PetscFree(onodes1));
1313     PetscCall(PetscFree(olengths1));
1314 
1315     /* Allocate Memory for outgoing messages */
1316     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
1317     PetscCall(PetscArrayzero(sbuf1,size));
1318     PetscCall(PetscArrayzero(ptr,size));
1319 
1320     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1321     iptr = tmp;
1322     for (i=0; i<nrqs; i++) {
1323       proc        = pa[i];
1324       sbuf1[proc] = iptr;
1325       iptr       += w1[proc];
1326     }
1327 
1328     /* Form the outgoing messages */
1329     /* Initialize the header space */
1330     for (i=0; i<nrqs; i++) {
1331       proc      = pa[i];
1332       PetscCall(PetscArrayzero(sbuf1[proc],3));
1333       ptr[proc] = sbuf1[proc] + 3;
1334     }
1335 
1336     /* Parse the isrow and copy data into outbuf */
1337     PetscCall(PetscArrayzero(ctr,size));
1338     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1339       proc = row2proc[j];
1340       if (proc != rank) { /* copy to the outgoing buf*/
1341         *ptr[proc] = irow[j];
1342         ctr[proc]++; ptr[proc]++;
1343       }
1344     }
1345 
1346     /* Update the headers for the current IS */
1347     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1348       if ((ctr_j = ctr[j])) {
1349         sbuf1_j        = sbuf1[j];
1350         k              = ++sbuf1_j[0];
1351         sbuf1_j[2*k]   = ctr_j;
1352         sbuf1_j[2*k-1] = 0;
1353       }
1354     }
1355 
1356     /* Now post the sends */
1357     PetscCall(PetscMalloc1(nrqs,&s_waits1));
1358     for (i=0; i<nrqs; ++i) {
1359       proc = pa[i];
1360       PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i));
1361     }
1362 
1363     /* Post Receives to capture the buffer size */
1364     PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2));
1365     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
1366 
1367     if (nrqs) rbuf2[0] = tmp + msz;
1368     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1369 
1370     for (i=0; i<nrqs; ++i) {
1371       proc = pa[i];
1372       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i));
1373     }
1374 
1375     PetscCall(PetscFree2(w1,w2));
1376 
1377     /* Send to other procs the buf size they should allocate */
1378     /* Receive messages*/
1379     PetscCall(PetscMalloc1(nrqr,&r_status1));
1380     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
1381 
1382     PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1));
1383     for (i=0; i<nrqr; ++i) {
1384       req_size[i] = 0;
1385       rbuf1_i        = rbuf1[i];
1386       start          = 2*rbuf1_i[0] + 1;
1387       PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
1388       PetscCall(PetscMalloc1(end,&sbuf2[i]));
1389       sbuf2_i        = sbuf2[i];
1390       for (j=start; j<end; j++) {
1391         k            = rbuf1_i[j] - rstart;
1392         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1393         sbuf2_i[j]   = ncols;
1394         req_size[i] += ncols;
1395       }
1396       req_source1[i] = r_status1[i].MPI_SOURCE;
1397 
1398       /* form the header */
1399       sbuf2_i[0] = req_size[i];
1400       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1401 
1402       PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
1403     }
1404 
1405     PetscCall(PetscFree(r_status1));
1406     PetscCall(PetscFree(r_waits1));
1407 
1408     /* rbuf2 is received, Post recv column indices a->j */
1409     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2));
1410 
1411     PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3));
1412     for (i=0; i<nrqs; ++i) {
1413       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
1414       req_source2[i] = r_status2[i].MPI_SOURCE;
1415       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
1416     }
1417 
1418     /* Wait on sends1 and sends2 */
1419     PetscCall(PetscMalloc1(nrqs,&s_status1));
1420     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
1421     PetscCall(PetscFree(s_waits1));
1422     PetscCall(PetscFree(s_status1));
1423 
1424     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
1425     PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2));
1426 
1427     /* Now allocate sending buffers for a->j, and send them off */
1428     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
1429     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1430     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
1431     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1432 
1433     for (i=0; i<nrqr; i++) { /* for each requested message */
1434       rbuf1_i   = rbuf1[i];
1435       sbuf_aj_i = sbuf_aj[i];
1436       ct1       = 2*rbuf1_i[0] + 1;
1437       ct2       = 0;
1438       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1439 
1440       kmax = rbuf1[i][2];
1441       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1442         row    = rbuf1_i[ct1] - rstart;
1443         nzA    = ai[row+1] - ai[row];
1444         nzB    = bi[row+1] - bi[row];
1445         ncols  = nzA + nzB;
1446         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1447 
1448         /* load the column indices for this row into cols*/
1449         cols = sbuf_aj_i + ct2;
1450 
1451         lwrite = 0;
1452         for (l=0; l<nzB; l++) {
1453           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1454         }
1455         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1456         for (l=0; l<nzB; l++) {
1457           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1458         }
1459 
1460         ct2 += ncols;
1461       }
1462       PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
1463     }
1464 
1465     /* create column map (cmap): global col of C -> local col of submat */
1466 #if defined(PETSC_USE_CTABLE)
1467     if (!allcolumns) {
1468       PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap));
1469       PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc));
1470       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1471         if (icol[j] >= cstart && icol[j] <cend) {
1472           cmap_loc[icol[j] - cstart] = j+1;
1473         } else { /* use PetscTable for non-local col indices */
1474           PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES));
1475         }
1476       }
1477     } else {
1478       cmap     = NULL;
1479       cmap_loc = NULL;
1480     }
1481     PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc));
1482 #else
1483     if (!allcolumns) {
1484       PetscCall(PetscCalloc1(C->cmap->N,&cmap));
1485       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1486     } else {
1487       cmap = NULL;
1488     }
1489 #endif
1490 
1491     /* Create lens for MatSeqAIJSetPreallocation() */
1492     PetscCall(PetscCalloc1(nrow,&lens));
1493 
1494     /* Compute lens from local part of C */
1495     for (j=0; j<nrow; j++) {
1496       row  = irow[j];
1497       proc = row2proc[j];
1498       if (proc == rank) {
1499         /* diagonal part A = c->A */
1500         ncols = ai[row-rstart+1] - ai[row-rstart];
1501         cols  = aj + ai[row-rstart];
1502         if (!allcolumns) {
1503           for (k=0; k<ncols; k++) {
1504 #if defined(PETSC_USE_CTABLE)
1505             tcol = cmap_loc[cols[k]];
1506 #else
1507             tcol = cmap[cols[k]+cstart];
1508 #endif
1509             if (tcol) lens[j]++;
1510           }
1511         } else { /* allcolumns */
1512           lens[j] = ncols;
1513         }
1514 
1515         /* off-diagonal part B = c->B */
1516         ncols = bi[row-rstart+1] - bi[row-rstart];
1517         cols  = bj + bi[row-rstart];
1518         if (!allcolumns) {
1519           for (k=0; k<ncols; k++) {
1520 #if defined(PETSC_USE_CTABLE)
1521             PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1522 #else
1523             tcol = cmap[bmap[cols[k]]];
1524 #endif
1525             if (tcol) lens[j]++;
1526           }
1527         } else { /* allcolumns */
1528           lens[j] += ncols;
1529         }
1530       }
1531     }
1532 
1533     /* Create row map (rmap): global row of C -> local row of submat */
1534 #if defined(PETSC_USE_CTABLE)
1535     PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap));
1536     for (j=0; j<nrow; j++) {
1537       row  = irow[j];
1538       proc = row2proc[j];
1539       if (proc == rank) { /* a local row */
1540         rmap_loc[row - rstart] = j;
1541       } else {
1542         PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES));
1543       }
1544     }
1545 #else
1546     PetscCall(PetscCalloc1(C->rmap->N,&rmap));
1547     for (j=0; j<nrow; j++) {
1548       rmap[irow[j]] = j;
1549     }
1550 #endif
1551 
1552     /* Update lens from offproc data */
1553     /* recv a->j is done */
1554     PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3));
1555     for (i=0; i<nrqs; i++) {
1556       proc    = pa[i];
1557       sbuf1_i = sbuf1[proc];
1558       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1559       ct1     = 2 + 1;
1560       ct2     = 0;
1561       rbuf2_i = rbuf2[i]; /* received length of C->j */
1562       rbuf3_i = rbuf3[i]; /* received C->j */
1563 
1564       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1565       max1   = sbuf1_i[2];
1566       for (k=0; k<max1; k++,ct1++) {
1567 #if defined(PETSC_USE_CTABLE)
1568         PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row));
1569         row--;
1570         PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1571 #else
1572         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1573 #endif
1574         /* Now, store row index of submat in sbuf1_i[ct1] */
1575         sbuf1_i[ct1] = row;
1576 
1577         nnz = rbuf2_i[ct1];
1578         if (!allcolumns) {
1579           for (l=0; l<nnz; l++,ct2++) {
1580 #if defined(PETSC_USE_CTABLE)
1581             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1582               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1583             } else {
1584               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1585             }
1586 #else
1587             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1588 #endif
1589             if (tcol) lens[row]++;
1590           }
1591         } else { /* allcolumns */
1592           lens[row] += nnz;
1593         }
1594       }
1595     }
1596     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3));
1597     PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3));
1598 
1599     /* Create the submatrices */
1600     PetscCall(MatCreate(PETSC_COMM_SELF,&submat));
1601     PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE));
1602 
1603     PetscCall(ISGetBlockSize(isrow[0],&i));
1604     PetscCall(ISGetBlockSize(iscol[0],&j));
1605     PetscCall(MatSetBlockSizes(submat,i,j));
1606     PetscCall(MatSetType(submat,((PetscObject)A)->type_name));
1607     PetscCall(MatSeqAIJSetPreallocation(submat,0,lens));
1608 
1609     /* create struct Mat_SubSppt and attached it to submat */
1610     PetscCall(PetscNew(&smatis1));
1611     subc = (Mat_SeqAIJ*)submat->data;
1612     subc->submatis1 = smatis1;
1613 
1614     smatis1->id          = 0;
1615     smatis1->nrqs        = nrqs;
1616     smatis1->nrqr        = nrqr;
1617     smatis1->rbuf1       = rbuf1;
1618     smatis1->rbuf2       = rbuf2;
1619     smatis1->rbuf3       = rbuf3;
1620     smatis1->sbuf2       = sbuf2;
1621     smatis1->req_source2 = req_source2;
1622 
1623     smatis1->sbuf1       = sbuf1;
1624     smatis1->ptr         = ptr;
1625     smatis1->tmp         = tmp;
1626     smatis1->ctr         = ctr;
1627 
1628     smatis1->pa           = pa;
1629     smatis1->req_size     = req_size;
1630     smatis1->req_source1  = req_source1;
1631 
1632     smatis1->allcolumns  = allcolumns;
1633     smatis1->singleis    = PETSC_TRUE;
1634     smatis1->row2proc    = row2proc;
1635     smatis1->rmap        = rmap;
1636     smatis1->cmap        = cmap;
1637 #if defined(PETSC_USE_CTABLE)
1638     smatis1->rmap_loc    = rmap_loc;
1639     smatis1->cmap_loc    = cmap_loc;
1640 #endif
1641 
1642     smatis1->destroy     = submat->ops->destroy;
1643     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1644     submat->factortype   = C->factortype;
1645 
1646     /* compute rmax */
1647     rmax = 0;
1648     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1649 
1650   } else { /* scall == MAT_REUSE_MATRIX */
1651     submat = submats[0];
1652     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1653 
1654     subc    = (Mat_SeqAIJ*)submat->data;
1655     rmax    = subc->rmax;
1656     smatis1 = subc->submatis1;
1657     nrqs        = smatis1->nrqs;
1658     nrqr        = smatis1->nrqr;
1659     rbuf1       = smatis1->rbuf1;
1660     rbuf2       = smatis1->rbuf2;
1661     rbuf3       = smatis1->rbuf3;
1662     req_source2 = smatis1->req_source2;
1663 
1664     sbuf1     = smatis1->sbuf1;
1665     sbuf2     = smatis1->sbuf2;
1666     ptr       = smatis1->ptr;
1667     tmp       = smatis1->tmp;
1668     ctr       = smatis1->ctr;
1669 
1670     pa         = smatis1->pa;
1671     req_size   = smatis1->req_size;
1672     req_source1 = smatis1->req_source1;
1673 
1674     allcolumns = smatis1->allcolumns;
1675     row2proc   = smatis1->row2proc;
1676     rmap       = smatis1->rmap;
1677     cmap       = smatis1->cmap;
1678 #if defined(PETSC_USE_CTABLE)
1679     rmap_loc   = smatis1->rmap_loc;
1680     cmap_loc   = smatis1->cmap_loc;
1681 #endif
1682   }
1683 
1684   /* Post recv matrix values */
1685   PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals));
1686   PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4));
1687   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
1688   for (i=0; i<nrqs; ++i) {
1689     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
1690     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
1691   }
1692 
1693   /* Allocate sending buffers for a->a, and send them off */
1694   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
1695   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1696   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
1697   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1698 
1699   for (i=0; i<nrqr; i++) {
1700     rbuf1_i   = rbuf1[i];
1701     sbuf_aa_i = sbuf_aa[i];
1702     ct1       = 2*rbuf1_i[0]+1;
1703     ct2       = 0;
1704     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1705 
1706     kmax = rbuf1_i[2];
1707     for (k=0; k<kmax; k++,ct1++) {
1708       row = rbuf1_i[ct1] - rstart;
1709       nzA = ai[row+1] - ai[row];
1710       nzB = bi[row+1] - bi[row];
1711       ncols  = nzA + nzB;
1712       cworkB = bj + bi[row];
1713       vworkA = a_a + ai[row];
1714       vworkB = b_a + bi[row];
1715 
1716       /* load the column values for this row into vals*/
1717       vals = sbuf_aa_i + ct2;
1718 
1719       lwrite = 0;
1720       for (l=0; l<nzB; l++) {
1721         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1722       }
1723       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1724       for (l=0; l<nzB; l++) {
1725         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1726       }
1727 
1728       ct2 += ncols;
1729     }
1730     PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
1731   }
1732 
1733   /* Assemble submat */
1734   /* First assemble the local rows */
1735   for (j=0; j<nrow; j++) {
1736     row  = irow[j];
1737     proc = row2proc[j];
1738     if (proc == rank) {
1739       Crow = row - rstart;  /* local row index of C */
1740 #if defined(PETSC_USE_CTABLE)
1741       row = rmap_loc[Crow]; /* row index of submat */
1742 #else
1743       row = rmap[row];
1744 #endif
1745 
1746       if (allcolumns) {
1747         /* diagonal part A = c->A */
1748         ncols = ai[Crow+1] - ai[Crow];
1749         cols  = aj + ai[Crow];
1750         vals  = a_a + ai[Crow];
1751         i     = 0;
1752         for (k=0; k<ncols; k++) {
1753           subcols[i]   = cols[k] + cstart;
1754           subvals[i++] = vals[k];
1755         }
1756 
1757         /* off-diagonal part B = c->B */
1758         ncols = bi[Crow+1] - bi[Crow];
1759         cols  = bj + bi[Crow];
1760         vals  = b_a + bi[Crow];
1761         for (k=0; k<ncols; k++) {
1762           subcols[i]   = bmap[cols[k]];
1763           subvals[i++] = vals[k];
1764         }
1765 
1766         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1767 
1768       } else { /* !allcolumns */
1769 #if defined(PETSC_USE_CTABLE)
1770         /* diagonal part A = c->A */
1771         ncols = ai[Crow+1] - ai[Crow];
1772         cols  = aj + ai[Crow];
1773         vals  = a_a + ai[Crow];
1774         i     = 0;
1775         for (k=0; k<ncols; k++) {
1776           tcol = cmap_loc[cols[k]];
1777           if (tcol) {
1778             subcols[i]   = --tcol;
1779             subvals[i++] = vals[k];
1780           }
1781         }
1782 
1783         /* off-diagonal part B = c->B */
1784         ncols = bi[Crow+1] - bi[Crow];
1785         cols  = bj + bi[Crow];
1786         vals  = b_a + bi[Crow];
1787         for (k=0; k<ncols; k++) {
1788           PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1789           if (tcol) {
1790             subcols[i]   = --tcol;
1791             subvals[i++] = vals[k];
1792           }
1793         }
1794 #else
1795         /* diagonal part A = c->A */
1796         ncols = ai[Crow+1] - ai[Crow];
1797         cols  = aj + ai[Crow];
1798         vals  = a_a + ai[Crow];
1799         i     = 0;
1800         for (k=0; k<ncols; k++) {
1801           tcol = cmap[cols[k]+cstart];
1802           if (tcol) {
1803             subcols[i]   = --tcol;
1804             subvals[i++] = vals[k];
1805           }
1806         }
1807 
1808         /* off-diagonal part B = c->B */
1809         ncols = bi[Crow+1] - bi[Crow];
1810         cols  = bj + bi[Crow];
1811         vals  = b_a + bi[Crow];
1812         for (k=0; k<ncols; k++) {
1813           tcol = cmap[bmap[cols[k]]];
1814           if (tcol) {
1815             subcols[i]   = --tcol;
1816             subvals[i++] = vals[k];
1817           }
1818         }
1819 #endif
1820         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1821       }
1822     }
1823   }
1824 
1825   /* Now assemble the off-proc rows */
1826   for (i=0; i<nrqs; i++) { /* for each requested message */
1827     /* recv values from other processes */
1828     PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i));
1829     proc    = pa[idex];
1830     sbuf1_i = sbuf1[proc];
1831     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1832     ct1     = 2 + 1;
1833     ct2     = 0; /* count of received C->j */
1834     ct3     = 0; /* count of received C->j that will be inserted into submat */
1835     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1836     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1837     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1838 
1839     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1840     max1 = sbuf1_i[2];             /* num of rows */
1841     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1842       row = sbuf1_i[ct1]; /* row index of submat */
1843       if (!allcolumns) {
1844         idex = 0;
1845         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1846           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1847           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1848 #if defined(PETSC_USE_CTABLE)
1849             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1850               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1851             } else {
1852               PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1853             }
1854 #else
1855             tcol = cmap[rbuf3_i[ct2]];
1856 #endif
1857             if (tcol) {
1858               subcols[idex]   = --tcol; /* may not be sorted */
1859               subvals[idex++] = rbuf4_i[ct2];
1860 
1861               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1862                For reuse, we replace received C->j with index that should be inserted to submat */
1863               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1864             }
1865           }
1866           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES));
1867         } else { /* scall == MAT_REUSE_MATRIX */
1868           submat = submats[0];
1869           subc   = (Mat_SeqAIJ*)submat->data;
1870 
1871           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1872           for (l=0; l<nnz; l++) {
1873             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1874             subvals[idex++] = rbuf4_i[ct2];
1875           }
1876 
1877           bj = subc->j + subc->i[row]; /* sorted column indices */
1878           PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES));
1879         }
1880       } else { /* allcolumns */
1881         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1882         PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES));
1883         ct2 += nnz;
1884       }
1885     }
1886   }
1887 
1888   /* sending a->a are done */
1889   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4));
1890   PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4));
1891 
1892   PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY));
1893   PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY));
1894   submats[0] = submat;
1895 
1896   /* Restore the indices */
1897   PetscCall(ISRestoreIndices(isrow[0],&irow));
1898   if (!allcolumns) {
1899     PetscCall(ISRestoreIndices(iscol[0],&icol));
1900   }
1901 
1902   /* Destroy allocated memory */
1903   for (i=0; i<nrqs; ++i) {
1904     PetscCall(PetscFree(rbuf4[i]));
1905   }
1906   PetscCall(PetscFree3(rbuf4,subcols,subvals));
1907   if (sbuf_aa) {
1908     PetscCall(PetscFree(sbuf_aa[0]));
1909     PetscCall(PetscFree(sbuf_aa));
1910   }
1911 
1912   if (scall == MAT_INITIAL_MATRIX) {
1913     PetscCall(PetscFree(lens));
1914     if (sbuf_aj) {
1915       PetscCall(PetscFree(sbuf_aj[0]));
1916       PetscCall(PetscFree(sbuf_aj));
1917     }
1918   }
1919   PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
1920   PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
1921   PetscFunctionReturn(0);
1922 }
1923 
1924 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1925 {
1926   PetscInt       ncol;
1927   PetscBool      colflag,allcolumns=PETSC_FALSE;
1928 
1929   PetscFunctionBegin;
1930   /* Allocate memory to hold all the submatrices */
1931   if (scall == MAT_INITIAL_MATRIX) {
1932     PetscCall(PetscCalloc1(2,submat));
1933   }
1934 
1935   /* Check for special case: each processor gets entire matrix columns */
1936   PetscCall(ISIdentity(iscol[0],&colflag));
1937   PetscCall(ISGetLocalSize(iscol[0],&ncol));
1938   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1939 
1940   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
1941   PetscFunctionReturn(0);
1942 }
1943 
/*
   MatCreateSubMatrices_MPIAIJ - Extracts ismax sequential submatrices from the
   parallel AIJ matrix C, one per (isrow[i], iscol[i]) pair.

   The extraction is done in stages of at most nmax index sets each, where nmax
   is sized so one stage touches roughly 20M PetscInts of column data.  This is
   a collective routine: every rank must execute the same number of stages and
   take the same fast-path branches, hence the MPI_Allreduce below.

   Input:
     C      - the MPI AIJ matrix
     ismax  - number of index-set pairs on this process
     isrow  - row index sets
     iscol  - column index sets
     scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
   Output:
     submat - array of sequential submatrices (allocated here when
              scall == MAT_INITIAL_MATRIX; caller-supplied otherwise)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = number of index sets handled per stage, chosen so each stage
     processes about 20M PetscInts worth of column data; at least 1 */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* Candidate fast path: the single IS pair selects the entire square matrix */
      PetscCall(ISIdentity(*isrow,&rowflag));
      PetscCall(ISIdentity(*iscol,&colflag));
      PetscCall(ISGetLocalSize(*isrow,&nrow));
      PetscCall(ISGetLocalSize(*iscol,&ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* user may opt out of the fast path via -use_fast_submatrix false */
        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Combine both reductions in one Allreduce: MPI_MAX over -wantallmatrix
       (values are -1 or 0, so the result is -1 only if EVERY rank wants the
       whole matrix) and MPI_MAX over the local stage counts */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      /* normal submatrix: the reuse context hangs off the SeqAIJ data */
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* the initial extraction used the single-IS path; reuse must too */
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
      PetscFunctionReturn(0);
    } else {
      /* replay the globally agreed stage count saved at initial creation */
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    /* fast path: copy the entire parallel matrix to a sequential one on each rank */
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(ismax+nstages,submat));
  }

  /* Stage loop: all ranks iterate nstages times (collective inside _Local);
     ranks that have run out of local index sets call with max_no == 0 and
     receive a dummy matrix carrying the reuse context */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else                   max_no = ismax-pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2042 
2043 /* -------------------------------------------------------------------------*/
2044 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2045 {
2046   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2047   Mat            A  = c->A;
2048   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2049   const PetscInt **icol,**irow;
2050   PetscInt       *nrow,*ncol,start;
2051   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2052   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2053   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2054   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2055   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2056 #if defined(PETSC_USE_CTABLE)
2057   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2058 #else
2059   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2060 #endif
2061   const PetscInt *irow_i;
2062   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2063   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2064   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2065   MPI_Comm       comm;
2066   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2067   PetscMPIInt    *onodes1,*olengths1,end;
2068   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2069   Mat_SubSppt    *smat_i;
2070   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2071   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2072 
2073   PetscFunctionBegin;
2074   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
2075   size = c->size;
2076   rank = c->rank;
2077 
2078   PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
2079   PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));
2080 
2081   for (i=0; i<ismax; i++) {
2082     PetscCall(ISSorted(iscol[i],&issorted[i]));
2083     if (!issorted[i]) iscsorted = issorted[i];
2084 
2085     PetscCall(ISSorted(isrow[i],&issorted[i]));
2086 
2087     PetscCall(ISGetIndices(isrow[i],&irow[i]));
2088     PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));
2089 
2090     /* Check for special case: allcolumn */
2091     PetscCall(ISIdentity(iscol[i],&colflag));
2092     PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
2093     if (colflag && ncol[i] == C->cmap->N) {
2094       allcolumns[i] = PETSC_TRUE;
2095       icol[i] = NULL;
2096     } else {
2097       allcolumns[i] = PETSC_FALSE;
2098       PetscCall(ISGetIndices(iscol[i],&icol[i]));
2099     }
2100   }
2101 
2102   if (scall == MAT_REUSE_MATRIX) {
2103     /* Assumes new rows are same length as the old rows */
2104     for (i=0; i<ismax; i++) {
2105       PetscCheck(submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
2106       subc = (Mat_SeqAIJ*)submats[i]->data;
2107       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2108 
2109       /* Initial matrix as if empty */
2110       PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n));
2111 
2112       smat_i   = subc->submatis1;
2113 
2114       nrqs        = smat_i->nrqs;
2115       nrqr        = smat_i->nrqr;
2116       rbuf1       = smat_i->rbuf1;
2117       rbuf2       = smat_i->rbuf2;
2118       rbuf3       = smat_i->rbuf3;
2119       req_source2 = smat_i->req_source2;
2120 
2121       sbuf1     = smat_i->sbuf1;
2122       sbuf2     = smat_i->sbuf2;
2123       ptr       = smat_i->ptr;
2124       tmp       = smat_i->tmp;
2125       ctr       = smat_i->ctr;
2126 
2127       pa          = smat_i->pa;
2128       req_size    = smat_i->req_size;
2129       req_source1 = smat_i->req_source1;
2130 
2131       allcolumns[i] = smat_i->allcolumns;
2132       row2proc[i]   = smat_i->row2proc;
2133       rmap[i]       = smat_i->rmap;
2134       cmap[i]       = smat_i->cmap;
2135     }
2136 
2137     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2138       PetscCheck(submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2139       smat_i = (Mat_SubSppt*)submats[0]->data;
2140 
2141       nrqs        = smat_i->nrqs;
2142       nrqr        = smat_i->nrqr;
2143       rbuf1       = smat_i->rbuf1;
2144       rbuf2       = smat_i->rbuf2;
2145       rbuf3       = smat_i->rbuf3;
2146       req_source2 = smat_i->req_source2;
2147 
2148       sbuf1       = smat_i->sbuf1;
2149       sbuf2       = smat_i->sbuf2;
2150       ptr         = smat_i->ptr;
2151       tmp         = smat_i->tmp;
2152       ctr         = smat_i->ctr;
2153 
2154       pa          = smat_i->pa;
2155       req_size    = smat_i->req_size;
2156       req_source1 = smat_i->req_source1;
2157 
2158       allcolumns[0] = PETSC_FALSE;
2159     }
2160   } else { /* scall == MAT_INITIAL_MATRIX */
2161     /* Get some new tags to keep the communication clean */
2162     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
2163     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));
2164 
2165     /* evaluate communication - mesg to who, length of mesg, and buffer space
2166      required. Based on this, buffers are allocated, and data copied into them*/
2167     PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */
2168 
2169     for (i=0; i<ismax; i++) {
2170       jmax   = nrow[i];
2171       irow_i = irow[i];
2172 
2173       PetscCall(PetscMalloc1(jmax,&row2proc_i));
2174       row2proc[i] = row2proc_i;
2175 
2176       if (issorted[i]) proc = 0;
2177       for (j=0; j<jmax; j++) {
2178         if (!issorted[i]) proc = 0;
2179         row = irow_i[j];
2180         while (row >= C->rmap->range[proc+1]) proc++;
2181         w4[proc]++;
2182         row2proc_i[j] = proc; /* map row index to proc */
2183       }
2184       for (j=0; j<size; j++) {
2185         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2186       }
2187     }
2188 
2189     nrqs     = 0;              /* no of outgoing messages */
2190     msz      = 0;              /* total mesg length (for all procs) */
2191     w1[rank] = 0;              /* no mesg sent to self */
2192     w3[rank] = 0;
2193     for (i=0; i<size; i++) {
2194       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2195     }
2196     PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
2197     for (i=0,j=0; i<size; i++) {
2198       if (w1[i]) { pa[j] = i; j++; }
2199     }
2200 
2201     /* Each message would have a header = 1 + 2*(no of IS) + data */
2202     for (i=0; i<nrqs; i++) {
2203       j      = pa[i];
2204       w1[j] += w2[j] + 2* w3[j];
2205       msz   += w1[j];
2206     }
2207     PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
2208 
2209     /* Determine the number of messages to expect, their lengths, from from-ids */
2210     PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
2211     PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
2212 
2213     /* Now post the Irecvs corresponding to these messages */
2214     PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
2215     PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
2216 
2217     /* Allocate Memory for outgoing messages */
2218     PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
2219     PetscCall(PetscArrayzero(sbuf1,size));
2220     PetscCall(PetscArrayzero(ptr,size));
2221 
2222     {
2223       PetscInt *iptr = tmp;
2224       k    = 0;
2225       for (i=0; i<nrqs; i++) {
2226         j        = pa[i];
2227         iptr    += k;
2228         sbuf1[j] = iptr;
2229         k        = w1[j];
2230       }
2231     }
2232 
2233     /* Form the outgoing messages. Initialize the header space */
2234     for (i=0; i<nrqs; i++) {
2235       j           = pa[i];
2236       sbuf1[j][0] = 0;
2237       PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
2238       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2239     }
2240 
2241     /* Parse the isrow and copy data into outbuf */
2242     for (i=0; i<ismax; i++) {
2243       row2proc_i = row2proc[i];
2244       PetscCall(PetscArrayzero(ctr,size));
2245       irow_i = irow[i];
2246       jmax   = nrow[i];
2247       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2248         proc = row2proc_i[j];
2249         if (proc != rank) { /* copy to the outgoing buf*/
2250           ctr[proc]++;
2251           *ptr[proc] = irow_i[j];
2252           ptr[proc]++;
2253         }
2254       }
2255       /* Update the headers for the current IS */
2256       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2257         if ((ctr_j = ctr[j])) {
2258           sbuf1_j        = sbuf1[j];
2259           k              = ++sbuf1_j[0];
2260           sbuf1_j[2*k]   = ctr_j;
2261           sbuf1_j[2*k-1] = i;
2262         }
2263       }
2264     }
2265 
2266     /*  Now  post the sends */
2267     PetscCall(PetscMalloc1(nrqs,&s_waits1));
2268     for (i=0; i<nrqs; ++i) {
2269       j    = pa[i];
2270       PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
2271     }
2272 
2273     /* Post Receives to capture the buffer size */
2274     PetscCall(PetscMalloc1(nrqs,&r_waits2));
2275     PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
2276     if (nrqs) rbuf2[0] = tmp + msz;
2277     for (i=1; i<nrqs; ++i) {
2278       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2279     }
2280     for (i=0; i<nrqs; ++i) {
2281       j    = pa[i];
2282       PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
2283     }
2284 
2285     /* Send to other procs the buf size they should allocate */
2286     /* Receive messages*/
2287     PetscCall(PetscMalloc1(nrqr,&s_waits2));
2288     PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
2289     {
2290       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2291       PetscInt   *sbuf2_i;
2292 
2293       PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
2294       for (i=0; i<nrqr; ++i) {
2295         req_size[i] = 0;
2296         rbuf1_i        = rbuf1[i];
2297         start          = 2*rbuf1_i[0] + 1;
2298         end            = olengths1[i];
2299         PetscCall(PetscMalloc1(end,&sbuf2[i]));
2300         sbuf2_i        = sbuf2[i];
2301         for (j=start; j<end; j++) {
2302           id              = rbuf1_i[j] - rstart;
2303           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2304           sbuf2_i[j]      = ncols;
2305           req_size[i] += ncols;
2306         }
2307         req_source1[i] = onodes1[i];
2308         /* form the header */
2309         sbuf2_i[0] = req_size[i];
2310         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2311 
2312         PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
2313       }
2314     }
2315 
2316     PetscCall(PetscFree(onodes1));
2317     PetscCall(PetscFree(olengths1));
2318     PetscCall(PetscFree(r_waits1));
2319     PetscCall(PetscFree4(w1,w2,w3,w4));
2320 
2321     /* Receive messages*/
2322     PetscCall(PetscMalloc1(nrqs,&r_waits3));
2323     PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
2324     for (i=0; i<nrqs; ++i) {
2325       PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
2326       req_source2[i] = pa[i];
2327       PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
2328     }
2329     PetscCall(PetscFree(r_waits2));
2330 
2331     /* Wait on sends1 and sends2 */
2332     PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
2333     PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
2334     PetscCall(PetscFree(s_waits1));
2335     PetscCall(PetscFree(s_waits2));
2336 
2337     /* Now allocate sending buffers for a->j, and send them off */
2338     PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
2339     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2340     if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
2341     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2342 
2343     PetscCall(PetscMalloc1(nrqr,&s_waits3));
2344     {
2345       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2346       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2347       PetscInt cend = C->cmap->rend;
2348       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2349 
2350       for (i=0; i<nrqr; i++) {
2351         rbuf1_i   = rbuf1[i];
2352         sbuf_aj_i = sbuf_aj[i];
2353         ct1       = 2*rbuf1_i[0] + 1;
2354         ct2       = 0;
2355         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2356           kmax = rbuf1[i][2*j];
2357           for (k=0; k<kmax; k++,ct1++) {
2358             row    = rbuf1_i[ct1] - rstart;
2359             nzA    = a_i[row+1] - a_i[row];
2360             nzB    = b_i[row+1] - b_i[row];
2361             ncols  = nzA + nzB;
2362             cworkA = a_j + a_i[row];
2363             cworkB = b_j + b_i[row];
2364 
2365             /* load the column indices for this row into cols */
2366             cols = sbuf_aj_i + ct2;
2367 
2368             lwrite = 0;
2369             for (l=0; l<nzB; l++) {
2370               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2371             }
2372             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2373             for (l=0; l<nzB; l++) {
2374               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2375             }
2376 
2377             ct2 += ncols;
2378           }
2379         }
2380         PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
2381       }
2382     }
2383 
2384     /* create col map: global col of C -> local col of submatrices */
2385     {
2386       const PetscInt *icol_i;
2387 #if defined(PETSC_USE_CTABLE)
2388       for (i=0; i<ismax; i++) {
2389         if (!allcolumns[i]) {
2390           PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));
2391 
2392           jmax   = ncol[i];
2393           icol_i = icol[i];
2394           cmap_i = cmap[i];
2395           for (j=0; j<jmax; j++) {
2396             PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
2397           }
2398         } else cmap[i] = NULL;
2399       }
2400 #else
2401       for (i=0; i<ismax; i++) {
2402         if (!allcolumns[i]) {
2403           PetscCall(PetscCalloc1(C->cmap->N,&cmap[i]));
2404           jmax   = ncol[i];
2405           icol_i = icol[i];
2406           cmap_i = cmap[i];
2407           for (j=0; j<jmax; j++) {
2408             cmap_i[icol_i[j]] = j+1;
2409           }
2410         } else cmap[i] = NULL;
2411       }
2412 #endif
2413     }
2414 
2415     /* Create lens which is required for MatCreate... */
2416     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2417     PetscCall(PetscMalloc1(ismax,&lens));
2418 
2419     if (ismax) {
2420       PetscCall(PetscCalloc1(j,&lens[0]));
2421     }
2422     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2423 
2424     /* Update lens from local data */
2425     for (i=0; i<ismax; i++) {
2426       row2proc_i = row2proc[i];
2427       jmax = nrow[i];
2428       if (!allcolumns[i]) cmap_i = cmap[i];
2429       irow_i = irow[i];
2430       lens_i = lens[i];
2431       for (j=0; j<jmax; j++) {
2432         row = irow_i[j];
2433         proc = row2proc_i[j];
2434         if (proc == rank) {
2435           PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2436           if (!allcolumns[i]) {
2437             for (k=0; k<ncols; k++) {
2438 #if defined(PETSC_USE_CTABLE)
2439               PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2440 #else
2441               tcol = cmap_i[cols[k]];
2442 #endif
2443               if (tcol) lens_i[j]++;
2444             }
2445           } else { /* allcolumns */
2446             lens_i[j] = ncols;
2447           }
2448           PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2449         }
2450       }
2451     }
2452 
2453     /* Create row map: global row of C -> local row of submatrices */
2454 #if defined(PETSC_USE_CTABLE)
2455     for (i=0; i<ismax; i++) {
2456       PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
2457       irow_i = irow[i];
2458       jmax   = nrow[i];
2459       for (j=0; j<jmax; j++) {
2460       PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
2461       }
2462     }
2463 #else
2464     for (i=0; i<ismax; i++) {
2465       PetscCall(PetscCalloc1(C->rmap->N,&rmap[i]));
2466       rmap_i = rmap[i];
2467       irow_i = irow[i];
2468       jmax   = nrow[i];
2469       for (j=0; j<jmax; j++) {
2470         rmap_i[irow_i[j]] = j;
2471       }
2472     }
2473 #endif
2474 
2475     /* Update lens from offproc data */
2476     {
2477       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2478 
2479       PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
2480       for (tmp2=0; tmp2<nrqs; tmp2++) {
2481         sbuf1_i = sbuf1[pa[tmp2]];
2482         jmax    = sbuf1_i[0];
2483         ct1     = 2*jmax+1;
2484         ct2     = 0;
2485         rbuf2_i = rbuf2[tmp2];
2486         rbuf3_i = rbuf3[tmp2];
2487         for (j=1; j<=jmax; j++) {
2488           is_no  = sbuf1_i[2*j-1];
2489           max1   = sbuf1_i[2*j];
2490           lens_i = lens[is_no];
2491           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2492           rmap_i = rmap[is_no];
2493           for (k=0; k<max1; k++,ct1++) {
2494 #if defined(PETSC_USE_CTABLE)
2495             PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
2496             row--;
2497             PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2498 #else
2499             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2500 #endif
2501             max2 = rbuf2_i[ct1];
2502             for (l=0; l<max2; l++,ct2++) {
2503               if (!allcolumns[is_no]) {
2504 #if defined(PETSC_USE_CTABLE)
2505                 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2506 #else
2507                 tcol = cmap_i[rbuf3_i[ct2]];
2508 #endif
2509                 if (tcol) lens_i[row]++;
2510               } else { /* allcolumns */
2511                 lens_i[row]++; /* lens_i[row] += max2 ? */
2512               }
2513             }
2514           }
2515         }
2516       }
2517     }
2518     PetscCall(PetscFree(r_waits3));
2519     PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
2520     PetscCall(PetscFree(s_waits3));
2521 
2522     /* Create the submatrices */
2523     for (i=0; i<ismax; i++) {
2524       PetscInt    rbs,cbs;
2525 
2526       PetscCall(ISGetBlockSize(isrow[i],&rbs));
2527       PetscCall(ISGetBlockSize(iscol[i],&cbs));
2528 
2529       PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
2530       PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));
2531 
2532       PetscCall(MatSetBlockSizes(submats[i],rbs,cbs));
2533       PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
2534       PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));
2535 
2536       /* create struct Mat_SubSppt and attached it to submat */
2537       PetscCall(PetscNew(&smat_i));
2538       subc = (Mat_SeqAIJ*)submats[i]->data;
2539       subc->submatis1 = smat_i;
2540 
2541       smat_i->destroy          = submats[i]->ops->destroy;
2542       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2543       submats[i]->factortype   = C->factortype;
2544 
2545       smat_i->id          = i;
2546       smat_i->nrqs        = nrqs;
2547       smat_i->nrqr        = nrqr;
2548       smat_i->rbuf1       = rbuf1;
2549       smat_i->rbuf2       = rbuf2;
2550       smat_i->rbuf3       = rbuf3;
2551       smat_i->sbuf2       = sbuf2;
2552       smat_i->req_source2 = req_source2;
2553 
2554       smat_i->sbuf1       = sbuf1;
2555       smat_i->ptr         = ptr;
2556       smat_i->tmp         = tmp;
2557       smat_i->ctr         = ctr;
2558 
2559       smat_i->pa           = pa;
2560       smat_i->req_size     = req_size;
2561       smat_i->req_source1  = req_source1;
2562 
2563       smat_i->allcolumns  = allcolumns[i];
2564       smat_i->singleis    = PETSC_FALSE;
2565       smat_i->row2proc    = row2proc[i];
2566       smat_i->rmap        = rmap[i];
2567       smat_i->cmap        = cmap[i];
2568     }
2569 
2570     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2571       PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
2572       PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
2573       PetscCall(MatSetType(submats[0],MATDUMMY));
2574 
2575       /* create struct Mat_SubSppt and attached it to submat */
2576       PetscCall(PetscNewLog(submats[0],&smat_i));
2577       submats[0]->data = (void*)smat_i;
2578 
2579       smat_i->destroy          = submats[0]->ops->destroy;
2580       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2581       submats[0]->factortype   = C->factortype;
2582 
2583       smat_i->id          = 0;
2584       smat_i->nrqs        = nrqs;
2585       smat_i->nrqr        = nrqr;
2586       smat_i->rbuf1       = rbuf1;
2587       smat_i->rbuf2       = rbuf2;
2588       smat_i->rbuf3       = rbuf3;
2589       smat_i->sbuf2       = sbuf2;
2590       smat_i->req_source2 = req_source2;
2591 
2592       smat_i->sbuf1       = sbuf1;
2593       smat_i->ptr         = ptr;
2594       smat_i->tmp         = tmp;
2595       smat_i->ctr         = ctr;
2596 
2597       smat_i->pa           = pa;
2598       smat_i->req_size     = req_size;
2599       smat_i->req_source1  = req_source1;
2600 
2601       smat_i->allcolumns  = PETSC_FALSE;
2602       smat_i->singleis    = PETSC_FALSE;
2603       smat_i->row2proc    = NULL;
2604       smat_i->rmap        = NULL;
2605       smat_i->cmap        = NULL;
2606     }
2607 
2608     if (ismax) PetscCall(PetscFree(lens[0]));
2609     PetscCall(PetscFree(lens));
2610     if (sbuf_aj) {
2611       PetscCall(PetscFree(sbuf_aj[0]));
2612       PetscCall(PetscFree(sbuf_aj));
2613     }
2614 
2615   } /* endof scall == MAT_INITIAL_MATRIX */
2616 
2617   /* Post recv matrix values */
2618   PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
2619   PetscCall(PetscMalloc1(nrqs,&rbuf4));
2620   PetscCall(PetscMalloc1(nrqs,&r_waits4));
2621   for (i=0; i<nrqs; ++i) {
2622     PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
2623     PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
2624   }
2625 
2626   /* Allocate sending buffers for a->a, and send them off */
2627   PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
2628   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2629   if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
2630   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2631 
2632   PetscCall(PetscMalloc1(nrqr,&s_waits4));
2633   {
2634     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2635     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2636     PetscInt    cend   = C->cmap->rend;
2637     PetscInt    *b_j   = b->j;
2638     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2639 
2640     PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
2641     PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
2642     for (i=0; i<nrqr; i++) {
2643       rbuf1_i   = rbuf1[i];
2644       sbuf_aa_i = sbuf_aa[i];
2645       ct1       = 2*rbuf1_i[0]+1;
2646       ct2       = 0;
2647       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2648         kmax = rbuf1_i[2*j];
2649         for (k=0; k<kmax; k++,ct1++) {
2650           row    = rbuf1_i[ct1] - rstart;
2651           nzA    = a_i[row+1] - a_i[row];
2652           nzB    = b_i[row+1] - b_i[row];
2653           ncols  = nzA + nzB;
2654           cworkB = b_j + b_i[row];
2655           vworkA = a_a + a_i[row];
2656           vworkB = b_a + b_i[row];
2657 
2658           /* load the column values for this row into vals*/
2659           vals = sbuf_aa_i+ct2;
2660 
2661           lwrite = 0;
2662           for (l=0; l<nzB; l++) {
2663             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2664           }
2665           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2666           for (l=0; l<nzB; l++) {
2667             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2668           }
2669 
2670           ct2 += ncols;
2671         }
2672       }
2673       PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
2674     }
2675     PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
2676     PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
2677   }
2678 
2679   /* Assemble the matrices */
2680   /* First assemble the local rows */
2681   for (i=0; i<ismax; i++) {
2682     row2proc_i = row2proc[i];
2683     subc      = (Mat_SeqAIJ*)submats[i]->data;
2684     imat_ilen = subc->ilen;
2685     imat_j    = subc->j;
2686     imat_i    = subc->i;
2687     imat_a    = subc->a;
2688 
2689     if (!allcolumns[i]) cmap_i = cmap[i];
2690     rmap_i = rmap[i];
2691     irow_i = irow[i];
2692     jmax   = nrow[i];
2693     for (j=0; j<jmax; j++) {
2694       row  = irow_i[j];
2695       proc = row2proc_i[j];
2696       if (proc == rank) {
2697         old_row = row;
2698 #if defined(PETSC_USE_CTABLE)
2699         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2700         row--;
2701 #else
2702         row = rmap_i[row];
2703 #endif
2704         ilen_row = imat_ilen[row];
2705         PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2706         mat_i    = imat_i[row];
2707         mat_a    = imat_a + mat_i;
2708         mat_j    = imat_j + mat_i;
2709         if (!allcolumns[i]) {
2710           for (k=0; k<ncols; k++) {
2711 #if defined(PETSC_USE_CTABLE)
2712             PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2713 #else
2714             tcol = cmap_i[cols[k]];
2715 #endif
2716             if (tcol) {
2717               *mat_j++ = tcol - 1;
2718               *mat_a++ = vals[k];
2719               ilen_row++;
2720             }
2721           }
2722         } else { /* allcolumns */
2723           for (k=0; k<ncols; k++) {
2724             *mat_j++ = cols[k];  /* global col index! */
2725             *mat_a++ = vals[k];
2726             ilen_row++;
2727           }
2728         }
2729         PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2730 
2731         imat_ilen[row] = ilen_row;
2732       }
2733     }
2734   }
2735 
2736   /* Now assemble the off proc rows */
2737   PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
2738   for (tmp2=0; tmp2<nrqs; tmp2++) {
2739     sbuf1_i = sbuf1[pa[tmp2]];
2740     jmax    = sbuf1_i[0];
2741     ct1     = 2*jmax + 1;
2742     ct2     = 0;
2743     rbuf2_i = rbuf2[tmp2];
2744     rbuf3_i = rbuf3[tmp2];
2745     rbuf4_i = rbuf4[tmp2];
2746     for (j=1; j<=jmax; j++) {
2747       is_no     = sbuf1_i[2*j-1];
2748       rmap_i    = rmap[is_no];
2749       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2750       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2751       imat_ilen = subc->ilen;
2752       imat_j    = subc->j;
2753       imat_i    = subc->i;
2754       imat_a    = subc->a;
2755       max1      = sbuf1_i[2*j];
2756       for (k=0; k<max1; k++,ct1++) {
2757         row = sbuf1_i[ct1];
2758 #if defined(PETSC_USE_CTABLE)
2759         PetscCall(PetscTableFind(rmap_i,row+1,&row));
2760         row--;
2761 #else
2762         row = rmap_i[row];
2763 #endif
2764         ilen  = imat_ilen[row];
2765         mat_i = imat_i[row];
2766         mat_a = imat_a + mat_i;
2767         mat_j = imat_j + mat_i;
2768         max2  = rbuf2_i[ct1];
2769         if (!allcolumns[is_no]) {
2770           for (l=0; l<max2; l++,ct2++) {
2771 #if defined(PETSC_USE_CTABLE)
2772             PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2773 #else
2774             tcol = cmap_i[rbuf3_i[ct2]];
2775 #endif
2776             if (tcol) {
2777               *mat_j++ = tcol - 1;
2778               *mat_a++ = rbuf4_i[ct2];
2779               ilen++;
2780             }
2781           }
2782         } else { /* allcolumns */
2783           for (l=0; l<max2; l++,ct2++) {
2784             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2785             *mat_a++ = rbuf4_i[ct2];
2786             ilen++;
2787           }
2788         }
2789         imat_ilen[row] = ilen;
2790       }
2791     }
2792   }
2793 
2794   if (!iscsorted) { /* sort column indices of the rows */
2795     for (i=0; i<ismax; i++) {
2796       subc      = (Mat_SeqAIJ*)submats[i]->data;
2797       imat_j    = subc->j;
2798       imat_i    = subc->i;
2799       imat_a    = subc->a;
2800       imat_ilen = subc->ilen;
2801 
2802       if (allcolumns[i]) continue;
2803       jmax = nrow[i];
2804       for (j=0; j<jmax; j++) {
2805         mat_i = imat_i[j];
2806         mat_a = imat_a + mat_i;
2807         mat_j = imat_j + mat_i;
2808         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
2809       }
2810     }
2811   }
2812 
2813   PetscCall(PetscFree(r_waits4));
2814   PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
2815   PetscCall(PetscFree(s_waits4));
2816 
2817   /* Restore the indices */
2818   for (i=0; i<ismax; i++) {
2819     PetscCall(ISRestoreIndices(isrow[i],irow+i));
2820     if (!allcolumns[i]) {
2821       PetscCall(ISRestoreIndices(iscol[i],icol+i));
2822     }
2823   }
2824 
2825   for (i=0; i<ismax; i++) {
2826     PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
2827     PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
2828   }
2829 
2830   /* Destroy allocated memory */
2831   if (sbuf_aa) {
2832     PetscCall(PetscFree(sbuf_aa[0]));
2833     PetscCall(PetscFree(sbuf_aa));
2834   }
2835   PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));
2836 
2837   for (i=0; i<nrqs; ++i) {
2838     PetscCall(PetscFree(rbuf4[i]));
2839   }
2840   PetscCall(PetscFree(rbuf4));
2841 
2842   PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns));
2843   PetscFunctionReturn(0);
2844 }
2845 
2846 /*
2847  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2848  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2849  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2850  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2851  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2852  state, and needs to be "assembled" later by compressing B's column space.
2853 
2854  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2855  Following this call, C->A & C->B have been created, even if empty.
2856  */
2857 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2858 {
2859   /* If making this function public, change the error returned in this function away from _PLIB. */
2860   Mat_MPIAIJ     *aij;
2861   Mat_SeqAIJ     *Baij;
2862   PetscBool      seqaij,Bdisassembled;
2863   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2864   PetscScalar    v;
2865   const PetscInt *rowindices,*colindices;
2866 
2867   PetscFunctionBegin;
2868   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2869   if (A) {
2870     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij));
2871     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2872     if (rowemb) {
2873       PetscCall(ISGetLocalSize(rowemb,&m));
2874       PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2875     } else {
2876       if (C->rmap->n != A->rmap->n) {
2877         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2878       }
2879     }
2880     if (dcolemb) {
2881       PetscCall(ISGetLocalSize(dcolemb,&n));
2882       PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n);
2883     } else {
2884       PetscCheck(C->cmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2885     }
2886   }
2887   if (B) {
2888     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij));
2889     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2890     if (rowemb) {
2891       PetscCall(ISGetLocalSize(rowemb,&m));
2892       PetscCheck(m == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2893     } else {
2894       if (C->rmap->n != B->rmap->n) {
2895         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2896       }
2897     }
2898     if (ocolemb) {
2899       PetscCall(ISGetLocalSize(ocolemb,&n));
2900       PetscCheck(n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n);
2901     } else {
2902       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2903     }
2904   }
2905 
2906   aij = (Mat_MPIAIJ*)C->data;
2907   if (!aij->A) {
2908     /* Mimic parts of MatMPIAIJSetPreallocation() */
2909     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A));
2910     PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n));
2911     PetscCall(MatSetBlockSizesFromMats(aij->A,C,C));
2912     PetscCall(MatSetType(aij->A,MATSEQAIJ));
2913     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A));
2914   }
2915   if (A) {
2916     PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A));
2917   } else {
2918     PetscCall(MatSetUp(aij->A));
2919   }
2920   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2921     /*
2922       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2923       need to "disassemble" B -- convert it to using C's global indices.
2924       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2925 
2926       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2927       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2928 
2929       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2930       At least avoid calling MatSetValues() and the implied searches?
2931     */
2932 
2933     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2934 #if defined(PETSC_USE_CTABLE)
2935       PetscCall(PetscTableDestroy(&aij->colmap));
2936 #else
2937       PetscCall(PetscFree(aij->colmap));
2938       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2939       if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt)));
2940 #endif
2941       ngcol = 0;
2942       if (aij->lvec) {
2943         PetscCall(VecGetSize(aij->lvec,&ngcol));
2944       }
2945       if (aij->garray) {
2946         PetscCall(PetscFree(aij->garray));
2947         PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt)));
2948       }
2949       PetscCall(VecDestroy(&aij->lvec));
2950       PetscCall(VecScatterDestroy(&aij->Mvctx));
2951     }
2952     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2953       PetscCall(MatDestroy(&aij->B));
2954     }
2955     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2956       PetscCall(MatZeroEntries(aij->B));
2957     }
2958   }
2959   Bdisassembled = PETSC_FALSE;
2960   if (!aij->B) {
2961     PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B));
2962     PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B));
2963     PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N));
2964     PetscCall(MatSetBlockSizesFromMats(aij->B,B,B));
2965     PetscCall(MatSetType(aij->B,MATSEQAIJ));
2966     Bdisassembled = PETSC_TRUE;
2967   }
2968   if (B) {
2969     Baij = (Mat_SeqAIJ*)B->data;
2970     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2971       PetscCall(PetscMalloc1(B->rmap->n,&nz));
2972       for (i=0; i<B->rmap->n; i++) {
2973         nz[i] = Baij->i[i+1] - Baij->i[i];
2974       }
2975       PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz));
2976       PetscCall(PetscFree(nz));
2977     }
2978 
2979     PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend));
2980     shift = rend-rstart;
2981     count = 0;
2982     rowindices = NULL;
2983     colindices = NULL;
2984     if (rowemb) {
2985       PetscCall(ISGetIndices(rowemb,&rowindices));
2986     }
2987     if (ocolemb) {
2988       PetscCall(ISGetIndices(ocolemb,&colindices));
2989     }
2990     for (i=0; i<B->rmap->n; i++) {
2991       PetscInt row;
2992       row = i;
2993       if (rowindices) row = rowindices[i];
2994       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2995         col  = Baij->j[count];
2996         if (colindices) col = colindices[col];
2997         if (Bdisassembled && col>=rstart) col += shift;
2998         v    = Baij->a[count];
2999         PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES));
3000         ++count;
3001       }
3002     }
3003     /* No assembly for aij->B is necessary. */
3004     /* FIXME: set aij->B's nonzerostate correctly. */
3005   } else {
3006     PetscCall(MatSetUp(aij->B));
3007   }
3008   C->preallocated  = PETSC_TRUE;
3009   C->was_assembled = PETSC_FALSE;
3010   C->assembled     = PETSC_FALSE;
3011    /*
3012       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3013       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3014    */
3015   PetscFunctionReturn(0);
3016 }
3017 
3018 /*
3019   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3020  */
3021 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3022 {
3023   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3024 
3025   PetscFunctionBegin;
3026   PetscValidPointer(A,2);
3027   PetscValidPointer(B,3);
3028   /* FIXME: make sure C is assembled */
3029   *A = aij->A;
3030   *B = aij->B;
3031   /* Note that we don't incref *A and *B, so be careful! */
3032   PetscFunctionReturn(0);
3033 }
3034 
3035 /*
3036   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3037   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3038 */
3039 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3040                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3041                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3042                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3043                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3044 {
3045   PetscMPIInt    size,flag;
3046   PetscInt       i,ii,cismax,ispar;
3047   Mat            *A,*B;
3048   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3049 
3050   PetscFunctionBegin;
3051   if (!ismax) PetscFunctionReturn(0);
3052 
3053   for (i = 0, cismax = 0; i < ismax; ++i) {
3054     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
3055     PetscCheck(flag == MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3056     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3057     if (size > 1) ++cismax;
3058   }
3059 
3060   /*
3061      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3062      ispar counts the number of parallel ISs across C's comm.
3063   */
3064   PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
3065   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3066     PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
3067     PetscFunctionReturn(0);
3068   }
3069 
3070   /* if (ispar) */
3071   /*
3072     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3073     These are used to extract the off-diag portion of the resulting parallel matrix.
3074     The row IS for the off-diag portion is the same as for the diag portion,
3075     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3076   */
3077   PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
3078   PetscCall(PetscMalloc1(cismax,&ciscol_p));
3079   for (i = 0, ii = 0; i < ismax; ++i) {
3080     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3081     if (size > 1) {
3082       /*
3083          TODO: This is the part that's ***NOT SCALABLE***.
3084          To fix this we need to extract just the indices of C's nonzero columns
3085          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3086          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3087          to be done without serializing on the IS list, so, most likely, it is best
3088          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3089       */
3090       PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
3091       /* Now we have to
3092          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3093              were sorted on each rank, concatenated they might no longer be sorted;
3094          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3095              indices in the nondecreasing order to the original index positions.
3096          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3097       */
3098       PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
3099       PetscCall(ISSort(ciscol[ii]));
3100       ++ii;
3101     }
3102   }
3103   PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
3104   for (i = 0, ii = 0; i < ismax; ++i) {
3105     PetscInt       j,issize;
3106     const PetscInt *indices;
3107 
3108     /*
3109        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3110      */
3111     PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
3112     PetscCall(ISSort(isrow[i]));
3113     PetscCall(ISGetLocalSize(isrow[i],&issize));
3114     PetscCall(ISGetIndices(isrow[i],&indices));
3115     for (j = 1; j < issize; ++j) {
3116       PetscCheck(indices[j] != indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3117     }
3118     PetscCall(ISRestoreIndices(isrow[i],&indices));
3119     PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
3120     PetscCall(ISSort(iscol[i]));
3121     PetscCall(ISGetLocalSize(iscol[i],&issize));
3122     PetscCall(ISGetIndices(iscol[i],&indices));
3123     for (j = 1; j < issize; ++j) {
3124       PetscCheck(indices[j-1] != indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3125     }
3126     PetscCall(ISRestoreIndices(iscol[i],&indices));
3127     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3128     if (size > 1) {
3129       cisrow[ii] = isrow[i];
3130       ++ii;
3131     }
3132   }
3133   /*
3134     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3135     array of sequential matrices underlying the resulting parallel matrices.
3136     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3137     contain duplicates.
3138 
3139     There are as many diag matrices as there are original index sets. There are only as many parallel
3140     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3141 
3142     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3143     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3144       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3145       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3146       with A[i] and B[ii] extracted from the corresponding MPI submat.
3147     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3148       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3149       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3150       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3151       values into A[i] and B[ii] sitting inside the corresponding submat.
3152     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3153       A[i], B[ii], AA[i] or BB[ii] matrices.
3154   */
3155   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3156   if (scall == MAT_INITIAL_MATRIX) {
3157     PetscCall(PetscMalloc1(ismax,submat));
3158   }
3159 
3160   /* Now obtain the sequential A and B submatrices separately. */
3161   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3162   PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
3163   PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));
3164 
3165   /*
3166     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3167     matrices A & B have been extracted directly into the parallel matrices containing them, or
3168     simply into the sequential matrix identical with the corresponding A (if size == 1).
3169     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3170     to have the same sparsity pattern.
3171     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3172     must be constructed for C. This is done by setseqmat(s).
3173   */
3174   for (i = 0, ii = 0; i < ismax; ++i) {
3175     /*
3176        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3177        That way we can avoid sorting and computing permutations when reusing.
3178        To this end:
3179         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3180         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3181           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3182     */
3183     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3184 
3185     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3186     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3187     if (size > 1) {
3188       if (scall == MAT_INITIAL_MATRIX) {
3189         PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
3190         PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
3191         PetscCall(MatSetType((*submat)[i],MATMPIAIJ));
3192         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3193         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3194       }
3195       /*
3196         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3197       */
3198       {
3199         Mat AA = A[i],BB = B[ii];
3200 
3201         if (AA || BB) {
3202           PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
3203           PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
3204           PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
3205         }
3206         PetscCall(MatDestroy(&AA));
3207       }
3208       PetscCall(ISDestroy(ciscol+ii));
3209       PetscCall(ISDestroy(ciscol_p+ii));
3210       ++ii;
3211     } else { /* if (size == 1) */
3212       if (scall == MAT_REUSE_MATRIX) {
3213         PetscCall(MatDestroy(&(*submat)[i]));
3214       }
3215       if (isrow_p[i] || iscol_p[i]) {
3216         PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
3217         PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3220         PetscCall(MatDestroy(A+i));
3221       } else (*submat)[i] = A[i];
3222     }
3223     PetscCall(ISDestroy(&isrow_p[i]));
3224     PetscCall(ISDestroy(&iscol_p[i]));
3225   }
3226   PetscCall(PetscFree2(cisrow,ciscol));
3227   PetscCall(PetscFree2(isrow_p,iscol_p));
3228   PetscCall(PetscFree(ciscol_p));
3229   PetscCall(PetscFree(A));
3230   PetscCall(MatDestroySubMatrices(cismax,&B));
3231   PetscFunctionReturn(0);
3232 }
3233 
/*
   MatCreateSubMatricesMPI_MPIAIJ - MPIAIJ entry point for extracting (possibly parallel)
   submatrices of C indexed by isrow[]/iscol[].

   It delegates all work to the type-generic driver MatCreateSubMatricesMPI_MPIXAIJ(),
   supplying the four AIJ-specific callbacks it needs:
     - MatCreateSubMatrices_MPIAIJ: extracts the sequential submatrices,
     - MatGetSeqMats_MPIAIJ:        retrieves the underlying Seq A/B pieces,
     - MatSetSeqMat_SeqAIJ:         embeds one Seq matrix (size-1 comm case),
     - MatSetSeqMats_MPIAIJ:        embeds the Seq pieces into a parallel submatrix.

   Parameters mirror MatCreateSubMatricesMPI(): ismax index-set pairs, a MatReuse flag
   scall, and the output array submat[] (allocated by the callee when scall is
   MAT_INITIAL_MATRIX).
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}
3240