xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 70a7d78aacfbd24b2e31399a3d8e056944bb7de3)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix,
3    and to find submatrices that are shared across processors.
4 */
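/*
   A minimal usage sketch (not compiled here) of the public entry points that reach the
   routines in this file; A is a placeholder MATMPIAIJ matrix, rstart and the sizes are
   made up, and error checking (CHKERRQ) is omitted:

     IS  is[1];
     Mat *subA;
     ISCreateStride(PETSC_COMM_SELF,10,rstart,1,&is[0]);         // some locally chosen rows
     MatIncreaseOverlap(A,1,is,2);                               // grow each index set by overlap 2
     MatCreateSubMatrices(A,1,is,is,MAT_INITIAL_MATRIX,&subA);   // extract the overlapping submatrices
*/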
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
33 
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
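/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - one overlap sweep whose memory use scales with
   the number of requested rows rather than with the global matrix size.

   Remote rows are shipped with a PetscCommBuildTwoSided() handshake followed by a PetscSF
   broadcast, a pattern used twice below.  A minimal sketch of that pattern (nleaves, remote,
   senddata and recvdata are placeholders, CHKERRQ omitted):

     PetscSF     sf;
     PetscSFNode *remote;    // one entry per value this rank receives:
                             // remote[k].rank = sending rank, remote[k].index = offset in the sender's buffer
     PetscSFCreate(comm,&sf);
     PetscSFSetGraph(sf,nleaves,nleaves,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
     PetscSFBcastBegin(sf,MPIU_INT,senddata,recvdata,MPI_REPLACE);  // roots (sender side) -> leaves (receiver side)
     PetscSFBcastEnd(sf,MPIU_INT,senddata,recvdata,MPI_REPLACE);
     PetscSFDestroy(&sf);
*/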
47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48 {
49   PetscErrorCode ierr;
50   MPI_Comm       comm;
51   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54   const PetscInt *indices_i,**indices;
55   PetscLayout    rmap;
56   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
57   PetscSF        sf;
58   PetscSFNode    *remote;
59 
60   PetscFunctionBegin;
61   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
62   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
63   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
64   /* get row map to determine where rows should be going */
65   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
66   /* retrieve IS data and put all together so that we
67    * can optimize communication
68    */
69   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
70   for (i=0,tlength=0; i<nidx; i++) {
71     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
72     tlength += length[i];
73     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
74   }
75   /* find these rows on remote processors */
76   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
77   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
78   nrrows = 0;
79   for (i=0; i<nidx; i++) {
80     length_i  = length[i];
81     indices_i = indices[i];
82     for (j=0; j<length_i; j++) {
83       owner = -1;
84       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
85       /* remote processors */
86       if (owner != rank) {
87         tosizes_temp[owner]++; /* number of rows to owner */
88         rrow_ranks[nrrows]   = owner; /* processor */
89         rrow_isids[nrrows]   = i; /* is id */
90         remoterows[nrrows++] = indices_i[j]; /* row */
91       }
92     }
93     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
94   }
95   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
96   /* test if we need to exchange messages;
97    * generally speaking, we do not need to exchange
98    * data when the overlap is 1
99    */
100   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
101   /* no rank has any remote rows to request;
102    * this usually corresponds to overlap 1
103    */
104   if (!reducednrrows) {
105     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
106     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
107     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
108     PetscFunctionReturn(0);
109   }
110   nto = 0;
111   /* send sizes and ranks for building a two-sided communication */
112   for (i=0; i<size; i++) {
113     if (tosizes_temp[i]) {
114       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115       tosizes_temp[i] = nto; /* a map from processor to index */
116       toranks[nto++]  = i; /* processor */
117     }
118   }
119   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
120   toffsets[0] = 0;
121   for (i=0; i<nto; i++) {
122     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124   }
125   /* send information to other processors */
126   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
127   nrecvrows = 0;
128   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
130   nrecvrows = 0;
131   for (i=0; i<nfrom; i++) {
132     for (j=0; j<fromsizes[2*i]; j++) {
133       remote[nrecvrows].rank    = fromranks[i];
134       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135     }
136   }
137   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
138   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
139   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
140   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
141   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
142   /* message pair <no of is, row>  */
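  /* For example (a sketch with made-up numbers): if this rank needs row 11 for is[0] and
   * row 40 for is[3], both owned by the same remote rank, the segment of todata packed for
   * that rank reads  ... 0, 11, 3, 40, ...  i.e. consecutive (is id, global row) pairs. */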
143   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
144   for (i=0; i<nrrows; i++) {
145     owner = rrow_ranks[i]; /* processor */
146     j     = tosizes_temp[owner]; /* index */
147     todata[toffsets[j]++] = rrow_isids[i];
148     todata[toffsets[j]++] = remoterows[i];
149   }
150   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
151   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
152   ierr = PetscFree(toffsets);CHKERRQ(ierr);
153   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
154   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
155   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
156   /* send the rows belonging to remote ranks so that we can get the overlapping data back */
157   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
158   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
159   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
160   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
161   ierr = PetscFree(fromranks);CHKERRQ(ierr);
162   nrecvrows = 0;
163   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
165   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
166   nrecvrows = 0;
167   for (i=0; i<nto; i++) {
168     for (j=0; j<tosizes[2*i]; j++) {
169       remote[nrecvrows].rank    = toranks[i];
170       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171     }
172   }
173   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
174   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
175   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
177   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
178   /* overlap communication and computation */
179   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
180   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
181   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
183   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
184   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
185   ierr = PetscFree(toranks);CHKERRQ(ierr);
186   ierr = PetscFree(tosizes);CHKERRQ(ierr);
187   ierr = PetscFree(todata);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
190 
191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192 {
193   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195   const PetscInt   *indices_i_temp;
196   MPI_Comm         *iscomms;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++) {
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++) {
209     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
210     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
211     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
212     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
214   }
215   /* retrieve information to get row id and its overlap */
216   for (i=0; i<nrecvs;) {
217     is_id     = recvdata[i++];
218     data_size = recvdata[i++];
219     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220     isz_i     = isz[is_id];
221     for (j=0; j< data_size; j++) {
222       col = recvdata[i++];
223       indices_i[isz_i++] = col;
224     }
225     isz[is_id] = isz_i;
226   }
227   /* remove duplicate entries */
228   for (i=0; i<nidx; i++) {
229     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230     isz_i     = isz[i];
231     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
232     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
233     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
234   }
235   ierr = PetscFree(isz);CHKERRQ(ierr);
236   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
237   PetscFunctionReturn(0);
238 }
239 
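/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - expand the rows requested by each remote rank into
   their sorted, de-duplicated column indices and pack the reply buffers.

   The reply for one destination is a sequence of triples <is id, number of columns, columns...>.
   A sketch with made-up numbers: if is[0] picks up columns {4,5} and is[2] picks up {7,9,30},
   the corresponding part of sbdata reads

     0, 2, 4, 5,  2, 3, 7, 9, 30

   while sbsizes[2*i] holds the total length of rank i's reply and sbsizes[2*i+1] its offset.
*/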
240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241 {
242   PetscLayout       rmap,cmap;
243   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247   Mat               amat,bmat;
248   PetscMPIInt       rank;
249   PetscBool         done;
250   MPI_Comm          comm;
251   PetscErrorCode    ierr;
252 
253   PetscFunctionBegin;
254   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
255   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
256   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
257   /* Even if the matrix is symmetric, we do not exploit the symmetry here */
258   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   /* total number of nonzero values is used to estimate the memory usage in the next step */
263   tnz  = ai[an]+bi[bn];
264   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
265   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
267   /* to find the longest message */
268   max_fszs = 0;
269   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270   /* is there a better way to estimate the number of nonzeros in the mat? */
271   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
272   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273   rows_pos  = 0;
274   totalrows = 0;
275   for (i=0; i<nfrom; i++) {
276     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
277     /* group data together */
278     for (j=0; j<fromsizes[2*i]; j+=2) {
279       is_id                       = fromrows[rows_pos++];/* no of is */
280       rows_i                      = rows_data[is_id];
281       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282     }
283     /* estimate a space to avoid multiple allocations  */
284     for (j=0; j<nidx; j++) {
285       indvc_ij = 0;
286       rows_i   = rows_data[j];
287       for (l=0; l<rows_pos_i[j]; l++) {
288         row    = rows_i[l]-rstart;
289         start  = ai[row];
290         end    = ai[row+1];
291         for (k=start; k<end; k++) { /* Amat */
292           col = aj[k] + cstart;
293           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294         }
295         start = bi[row];
296         end   = bi[row+1];
297         for (k=start; k<end; k++) { /* Bmat */
298           col = gcols[bj[k]];
299           indices_tmp[indvc_ij++] = col;
300         }
301       }
302       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
303       indv_counts[i*nidx+j] = indvc_ij;
304       totalrows            += indvc_ij;
305     }
306   }
307   /* message triple <no of is, number of rows, rows> */
308   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
309   totalrows = 0;
310   rows_pos  = 0;
311   /* group the requested rows by IS again, reusing the logic above */
312   for (i=0;i<nfrom;i++) {
313     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
314     for (j=0; j<fromsizes[2*i]; j+=2) {
315       is_id                       = fromrows[rows_pos++];
316       rows_i                      = rows_data[is_id];
317       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318     }
319     /* add data  */
320     for (j=0; j<nidx; j++) {
321       if (!indv_counts[i*nidx+j]) continue;
322       indvc_ij = 0;
323       sbdata[totalrows++] = j;
324       sbdata[totalrows++] = indv_counts[i*nidx+j];
325       sbsizes[2*i]       += 2;
326       rows_i              = rows_data[j];
327       for (l=0; l<rows_pos_i[j]; l++) {
328         row   = rows_i[l]-rstart;
329         start = ai[row];
330         end   = ai[row+1];
331         for (k=start; k<end; k++) { /* Amat */
332           col = aj[k] + cstart;
333           indices_tmp[indvc_ij++] = col;
334         }
335         start = bi[row];
336         end   = bi[row+1];
337         for (k=start; k<end; k++) { /* Bmat */
338           col = gcols[bj[k]];
339           indices_tmp[indvc_ij++] = col;
340         }
341       }
342       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
343       sbsizes[2*i]  += indvc_ij;
344       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
345       totalrows += indvc_ij;
346     }
347   }
348   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
349   offsets[0] = 0;
350   for (i=0; i<nfrom; i++) {
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
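/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - for every locally owned row already in is[i], add
   all columns of that row (from both the diagonal block and the off-diagonal block) to is[i].

   The column recovery below is the idiom used throughout this file; a sketch for one local row
   (row is the local row index, cstart the first global column of the diagonal block, newcol a
   placeholder):

     for (k=ai[row]; k<ai[row+1]; k++) newcol = aj[k] + cstart;   // A part: local column -> global
     for (k=bi[row]; k<bi[row+1]; k++) newcol = gcols[bj[k]];     // B part: map through the garray
*/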
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp;
368   PetscMPIInt       rank,owner;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   /* is this a safe way to compute the number of nonzero values? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* this is a better way to estimate memory than the old implementation,
389    * which used the global size of the matrix
390    */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 /*
432   Sample message format:
433   If processor A wants processor B to process some elements corresponding
434   to index sets is[1], is[5]
435   mesg [0] = 2   (no of index sets in the mesg)
436   -----------
437   mesg [1] = 1 => is[1]
438   mesg [2] = sizeof(is[1]);
439   -----------
440   mesg [3] = 5  => is[5]
441   mesg [4] = sizeof(is[5]);
442   -----------
443   mesg [5]  => start of the data for is[1]
444   mesg [n]  => end of the data for is[1]
445   -----------
446   mesg [n+1] => start of the data for is[5]
447   mesg [m]  => end of the data for is[5]
448   -----------
449 
450   Notes:
451   nrqs - no of requests sent (or to be sent out)
452   nrqr - no of requests received (which have to be or which have been processed)
453 */
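/*
   A concrete instance of the format above (made-up numbers): if processor A asks B to process
   rows {3,17,20} on behalf of is[1] and rows {8,9} on behalf of is[5], the outgoing buffer is

     { 2,  1, 3,  5, 2,  3, 17, 20,  8, 9 }

   i.e. mesg[0] = 2 index sets, then the (IS id, count) pairs (1,3) and (5,2), then the two row
   lists back to back.
*/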
454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455 {
456   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458   const PetscInt **idx,*idx_i;
459   PetscInt       *n,**data,len;
460 #if defined(PETSC_USE_CTABLE)
461   PetscTable     *table_data,table_data_i;
462   PetscInt       *tdata,tcount,tcount_max;
463 #else
464   PetscInt       *data_i,*d_p;
465 #endif
466   PetscErrorCode ierr;
467   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470   PetscBT        *table;
471   MPI_Comm       comm;
472   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473   MPI_Status     *s_status,*recv_status;
474   MPI_Comm       *iscomms;
475   char           *t_p;
476 
477   PetscFunctionBegin;
478   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
479   size = c->size;
480   rank = c->rank;
481   M    = C->rmap->N;
482 
483   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
485 
486   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
487 
488   for (i=0; i<imax; i++) {
489     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
490     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
491   }
492 
493   /* evaluate communication - mesg to whom, length of mesg, and buffer space
494      required. Based on this, buffers are allocated, and data copied into them */
495   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
496   for (i=0; i<imax; i++) {
497     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
498     idx_i = idx[i];
499     len   = n[i];
500     for (j=0; j<len; j++) {
501       row = idx_i[j];
502       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
504       w4[proc]++;
505     }
506     for (j=0; j<size; j++) {
507       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508     }
509   }
510 
511   nrqs     = 0;              /* no of outgoing messages */
512   msz      = 0;              /* total mesg length (for all procs) */
513   w1[rank] = 0;              /* no mesg sent to itself */
514   w3[rank] = 0;
515   for (i=0; i<size; i++) {
516     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517   }
518   /* pa - is list of processors to communicate with */
519   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
520   for (i=0,j=0; i<size; i++) {
521     if (w1[i]) {pa[j] = i; j++;}
522   }
523 
524   /* Each message would have a header = 1 + 2*(no of IS) + data */
525   for (i=0; i<nrqs; i++) {
526     j      = pa[i];
527     w1[j] += w2[j] + 2*w3[j];
528     msz   += w1[j];
529   }
530 
531   /* Determine the number of messages to expect, their lengths, and their from-ids */
532   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
533   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
534 
535   /* Now post the Irecvs corresponding to these messages */
536   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
537 
538   /* Allocate Memory for outgoing messages */
539   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
540   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
541   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
542 
543   {
544     PetscInt *iptr = tmp,ict  = 0;
545     for (i=0; i<nrqs; i++) {
546       j         = pa[i];
547       iptr     +=  ict;
548       outdat[j] = iptr;
549       ict       = w1[j];
550     }
551   }
552 
553   /* Form the outgoing messages */
554   /* plug in the headers */
555   for (i=0; i<nrqs; i++) {
556     j            = pa[i];
557     outdat[j][0] = 0;
558     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
559     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560   }
561 
562   /* Memory for doing local proc's work */
563   {
564     PetscInt M_BPB_imax = 0;
565 #if defined(PETSC_USE_CTABLE)
566     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
567     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
570     }
571     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574     }
575 #else
576     PetscInt Mimax = 0;
577     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
578     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
579     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582       data[i]  = d_p + M*i;
583     }
584 #endif
585   }
586 
587   /* Parse the IS and update local tables and the outgoing buf with the data */
588   {
589     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590     PetscBT  table_i;
591 
592     for (i=0; i<imax; i++) {
593       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
594       n_i     = n[i];
595       table_i = table[i];
596       idx_i   = idx[i];
597 #if defined(PETSC_USE_CTABLE)
598       table_data_i = table_data[i];
599 #else
600       data_i  = data[i];
601 #endif
602       isz_i   = isz[i];
603       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604         row  = idx_i[j];
605         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
606         if (proc != rank) { /* copy to the outgoing buffer */
607           ctr[proc]++;
608           *ptr[proc] = row;
609           ptr[proc]++;
610         } else if (!PetscBTLookupSet(table_i,row)) {
611 #if defined(PETSC_USE_CTABLE)
612           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
613 #else
614           data_i[isz_i] = row; /* Update the local table */
615 #endif
616           isz_i++;
617         }
618       }
619       /* Update the headers for the current IS */
620       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621         if ((ctr_j = ctr[j])) {
622           outdat_j        = outdat[j];
623           k               = ++outdat_j[0];
624           outdat_j[2*k]   = ctr_j;
625           outdat_j[2*k-1] = i;
626         }
627       }
628       isz[i] = isz_i;
629     }
630   }
631 
632   /*  Now  post the sends */
633   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
634   for (i=0; i<nrqs; ++i) {
635     j    = pa[i];
636     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
637   }
638 
639   /* No longer need the original indices */
640   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
641   for (i=0; i<imax; ++i) {
642     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
643     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
644   }
645   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
646 
647   for (i=0; i<imax; ++i) {
648     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
649   }
650 
651   /* Do Local work */
652 #if defined(PETSC_USE_CTABLE)
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
654 #else
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
656 #endif
657 
658   /* Receive messages */
659   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
660   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);}
661 
662   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
663   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);}
664 
665   /* Phase 1 sends are complete - deallocate buffers */
666   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
667   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
668 
669   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
670   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
671   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
672   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
673   ierr = PetscFree(rbuf);CHKERRQ(ierr);
674 
675   /* Send the data back */
676   /* Do a global reduction to know the buffer space req for incoming messages */
677   {
678     PetscMPIInt *rw1;
679 
680     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
681 
682     for (i=0; i<nrqr; ++i) {
683       proc = recv_status[i].MPI_SOURCE;
684 
685       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
686       rw1[proc] = isz1[i];
687     }
688     ierr = PetscFree(onodes1);CHKERRQ(ierr);
689     ierr = PetscFree(olengths1);CHKERRQ(ierr);
690 
691     /* Determine the number of messages to expect, their lengths, and their from-ids */
692     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
693     ierr = PetscFree(rw1);CHKERRQ(ierr);
694   }
695   /* Now post the Irecvs corresponding to these messages */
696   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
697 
698   /* Now  post the sends */
699   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
700   for (i=0; i<nrqr; ++i) {
701     j    = recv_status[i].MPI_SOURCE;
702     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
703   }
704 
705   /* receive work done on other processors */
706   {
707     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
708     PetscMPIInt idex;
709     PetscBT     table_i;
710     MPI_Status  *status2;
711 
712     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
713     for (i=0; i<nrqs; ++i) {
714       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr);
715       /* Process the message */
716       rbuf2_i = rbuf2[idex];
717       ct1     = 2*rbuf2_i[0]+1;
718       jmax    = rbuf2[idex][0];
719       for (j=1; j<=jmax; j++) {
720         max     = rbuf2_i[2*j];
721         is_no   = rbuf2_i[2*j-1];
722         isz_i   = isz[is_no];
723         table_i = table[is_no];
724 #if defined(PETSC_USE_CTABLE)
725         table_data_i = table_data[is_no];
726 #else
727         data_i  = data[is_no];
728 #endif
729         for (k=0; k<max; k++,ct1++) {
730           row = rbuf2_i[ct1];
731           if (!PetscBTLookupSet(table_i,row)) {
732 #if defined(PETSC_USE_CTABLE)
733             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
734 #else
735             data_i[isz_i] = row;
736 #endif
737             isz_i++;
738           }
739         }
740         isz[is_no] = isz_i;
741       }
742     }
743 
744     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);}
745     ierr = PetscFree(status2);CHKERRQ(ierr);
746   }
747 
748 #if defined(PETSC_USE_CTABLE)
749   tcount_max = 0;
750   for (i=0; i<imax; ++i) {
751     table_data_i = table_data[i];
752     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
753     if (tcount_max < tcount) tcount_max = tcount;
754   }
755   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
756 #endif
757 
758   for (i=0; i<imax; ++i) {
759 #if defined(PETSC_USE_CTABLE)
760     PetscTablePosition tpos;
761     table_data_i = table_data[i];
762 
763     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
764     while (tpos) {
765       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
766       tdata[--j] = --k;
767     }
768     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
769 #else
770     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
771 #endif
772     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
773   }
774 
775   ierr = PetscFree(iscomms);CHKERRQ(ierr);
776   ierr = PetscFree(onodes2);CHKERRQ(ierr);
777   ierr = PetscFree(olengths2);CHKERRQ(ierr);
778 
779   ierr = PetscFree(pa);CHKERRQ(ierr);
780   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
781   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
782   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
783   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
784   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
785   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
786   ierr = PetscFree(s_status);CHKERRQ(ierr);
787   ierr = PetscFree(recv_status);CHKERRQ(ierr);
788   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
789   ierr = PetscFree(xdata);CHKERRQ(ierr);
790   ierr = PetscFree(isz1);CHKERRQ(ierr);
791 #if defined(PETSC_USE_CTABLE)
792   for (i=0; i<imax; i++) {
793     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
794   }
795   ierr = PetscFree(table_data);CHKERRQ(ierr);
796   ierr = PetscFree(tdata);CHKERRQ(ierr);
797   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
798 #else
799   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
800 #endif
801   PetscFunctionReturn(0);
802 }
803 
804 /*
805    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once, to do
806        the work on the local processor.
807 
808      Inputs:
809       C      - MATMPIAIJ matrix;
810       imax   - total no of index sets processed at a time;
811       table  - an array of char - size = m bits.
812 
813      Output:
814       isz    - array containing the count of the solution elements corresponding
815                to each index set;
816       data or table_data  - pointer to the solutions
817 */
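/*
   The duplicate-avoidance idiom used below (and in MatIncreaseOverlap_MPIAIJ_Once): table[i] is
   a bitmap over all global indices and data[i] (or table_data[i] when PETSC_USE_CTABLE is set)
   is the list collected so far, so a candidate index is appended only the first time it is seen.
   A sketch of the non-ctable branch:

     if (!PetscBTLookupSet(table_i,val)) {   // returns the previous bit and sets it
       data_i[isz_i++] = val;                // record val exactly once
     }
*/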
818 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
819 {
820   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
821   Mat        A  = c->A,B = c->B;
822   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
823   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
824   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
825   PetscBT    table_i;
826 #if defined(PETSC_USE_CTABLE)
827   PetscTable         table_data_i;
828   PetscErrorCode     ierr;
829   PetscTablePosition tpos;
830   PetscInt           tcount,*tdata;
831 #else
832   PetscInt           *data_i;
833 #endif
834 
835   PetscFunctionBegin;
836   rstart = C->rmap->rstart;
837   cstart = C->cmap->rstart;
838   ai     = a->i;
839   aj     = a->j;
840   bi     = b->i;
841   bj     = b->j;
842   garray = c->garray;
843 
844   for (i=0; i<imax; i++) {
845 #if defined(PETSC_USE_CTABLE)
846     /* copy existing entries of table_data_i into tdata[] */
847     table_data_i = table_data[i];
848     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
849     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);
850 
851     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
852     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
853     while (tpos) {
854       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
855       tdata[--j] = --row;
856       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
857     }
858 #else
859     data_i  = data[i];
860 #endif
861     table_i = table[i];
862     isz_i   = isz[i];
863     max     = isz[i];
864 
865     for (j=0; j<max; j++) {
866 #if defined(PETSC_USE_CTABLE)
867       row   = tdata[j] - rstart;
868 #else
869       row   = data_i[j] - rstart;
870 #endif
871       start = ai[row];
872       end   = ai[row+1];
873       for (k=start; k<end; k++) { /* Amat */
874         val = aj[k] + cstart;
875         if (!PetscBTLookupSet(table_i,val)) {
876 #if defined(PETSC_USE_CTABLE)
877           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
878 #else
879           data_i[isz_i] = val;
880 #endif
881           isz_i++;
882         }
883       }
884       start = bi[row];
885       end   = bi[row+1];
886       for (k=start; k<end; k++) { /* Bmat */
887         val = garray[bj[k]];
888         if (!PetscBTLookupSet(table_i,val)) {
889 #if defined(PETSC_USE_CTABLE)
890           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
891 #else
892           data_i[isz_i] = val;
893 #endif
894           isz_i++;
895         }
896       }
897     }
898     isz[i] = isz_i;
899 
900 #if defined(PETSC_USE_CTABLE)
901     ierr = PetscFree(tdata);CHKERRQ(ierr);
902 #endif
903   }
904   PetscFunctionReturn(0);
905 }
906 
907 /*
908       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
909          and return the output
910 
911          Input:
912            C    - the matrix
913            nrqr - no of messages being processed.
914            rbuf - an array of pointers to the received requests
915 
916          Output:
917            xdata - array of messages to be sent back
918            isz1  - size of each message
919 
920   For better efficiency perhaps we should malloc each xdata[i] separately;
921 then if a realloc is required we need only copy the data for that one row
922 rather than all previous rows, as it is now where a single large chunk of
923 memory is used.
924 
925 */
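/*
   The reply xdata[i] mirrors the request format: xdata[i][0] is the number of IS blocks, then
   the (is no, count) pairs, then the indices found for each block.  A sketch with made-up
   numbers, for a request that covered is[1] (4 indices found) and is[5] (2 indices found):

     { 2,  1, 4,  5, 2,  i0, i1, i2, i3,  j0, j1 }

   isz1[i] is the total length, and all xdata[i] share one contiguous allocation
   (xdata[i+1] = xdata[i] + isz1[i]), which is why a reallocation must re-fix every pointer.
*/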
926 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
927 {
928   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
929   Mat            A  = c->A,B = c->B;
930   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
931   PetscErrorCode ierr;
932   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
933   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
934   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
935   PetscInt       *rbuf_i,kmax,rbuf_0;
936   PetscBT        xtable;
937 
938   PetscFunctionBegin;
939   m      = C->rmap->N;
940   rstart = C->rmap->rstart;
941   cstart = C->cmap->rstart;
942   ai     = a->i;
943   aj     = a->j;
944   bi     = b->i;
945   bj     = b->j;
946   garray = c->garray;
947 
948   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
949     rbuf_i =  rbuf[i];
950     rbuf_0 =  rbuf_i[0];
951     ct    += rbuf_0;
952     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
953   }
954 
955   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
956   else max1 = 1;
957   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
958   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
959   ++no_malloc;
960   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
961   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
962 
963   ct3 = 0;
964   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
965     rbuf_i =  rbuf[i];
966     rbuf_0 =  rbuf_i[0];
967     ct1    =  2*rbuf_0+1;
968     ct2    =  ct1;
969     ct3   += ct1;
970     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
971       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
972       oct2 = ct2;
973       kmax = rbuf_i[2*j];
974       for (k=0; k<kmax; k++,ct1++) {
975         row = rbuf_i[ct1];
976         if (!PetscBTLookupSet(xtable,row)) {
977           if (!(ct3 < mem_estimate)) {
978             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
979             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
980             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
981             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
982             xdata[0]     = tmp;
983             mem_estimate = new_estimate; ++no_malloc;
984             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
985           }
986           xdata[i][ct2++] = row;
987           ct3++;
988         }
989       }
990       for (k=oct2,max2=ct2; k<max2; k++) {
991         row   = xdata[i][k] - rstart;
992         start = ai[row];
993         end   = ai[row+1];
994         for (l=start; l<end; l++) {
995           val = aj[l] + cstart;
996           if (!PetscBTLookupSet(xtable,val)) {
997             if (!(ct3 < mem_estimate)) {
998               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
999               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1000               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1001               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1002               xdata[0]     = tmp;
1003               mem_estimate = new_estimate; ++no_malloc;
1004               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1005             }
1006             xdata[i][ct2++] = val;
1007             ct3++;
1008           }
1009         }
1010         start = bi[row];
1011         end   = bi[row+1];
1012         for (l=start; l<end; l++) {
1013           val = garray[bj[l]];
1014           if (!PetscBTLookupSet(xtable,val)) {
1015             if (!(ct3 < mem_estimate)) {
1016               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1017               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1018               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1019               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1020               xdata[0]     = tmp;
1021               mem_estimate = new_estimate; ++no_malloc;
1022               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1023             }
1024             xdata[i][ct2++] = val;
1025             ct3++;
1026           }
1027         }
1028       }
1029       /* Update the header */
1030       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1031       xdata[i][2*j-1] = rbuf_i[2*j-1];
1032     }
1033     xdata[i][0] = rbuf_0;
1034     xdata[i+1]  = xdata[i] + ct2;
1035     isz1[i]     = ct2; /* size of each message */
1036   }
1037   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1038   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1039   PetscFunctionReturn(0);
1040 }
1041 /* -------------------------------------------------------------------------*/
1042 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1043 /*
1044     Every processor gets the entire matrix
1045 */
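/*
   The gathers below follow the standard MPI_Allgatherv recipe: recvcounts[p] is the number of
   entries owned by rank p and displs[] its exclusive prefix sum.  A minimal sketch (p and buf
   are placeholders, recvcounts already filled, the in-place form requires PETSC_HAVE_MPI_IN_PLACE):

     displs[0] = 0;
     for (p=1; p<size; p++) displs[p] = displs[p-1] + recvcounts[p-1];
     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,buf,recvcounts,displs,MPIU_INT,comm);
*/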
1046 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1047 {
1048   Mat            B;
1049   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1050   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1051   PetscErrorCode ierr;
1052   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1053   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1054   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1055 
1056   PetscFunctionBegin;
1057   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1058   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1059   if (scall == MAT_INITIAL_MATRIX) {
1060     /* ----------------------------------------------------------------
1061          Tell every processor the number of nonzeros per row
1062     */
1063     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1064     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1065       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1066     }
1067     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1068     for (i=0; i<size; i++) {
1069       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1070       displs[i]     = A->rmap->range[i];
1071     }
1072 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1073     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1074 #else
1075     sendcount = A->rmap->rend - A->rmap->rstart;
1076     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1077 #endif
1078     /* ---------------------------------------------------------------
1079          Create the sequential matrix of the same type as the local block diagonal
1080     */
1081     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1082     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1083     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1084     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1085     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1086     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1087     **Bin = B;
1088     b     = (Mat_SeqAIJ*)B->data;
1089 
1090     /*--------------------------------------------------------------------
1091        Copy my part of matrix column indices over
1092     */
1093     sendcount  = ad->nz + bd->nz;
1094     jsendbuf   = b->j + b->i[rstarts[rank]];
1095     a_jsendbuf = ad->j;
1096     b_jsendbuf = bd->j;
1097     n          = A->rmap->rend - A->rmap->rstart;
1098     cnt        = 0;
1099     for (i=0; i<n; i++) {
1100       /* put in lower diagonal portion */
1101       m = bd->i[i+1] - bd->i[i];
1102       while (m > 0) {
1103         /* is it above diagonal (in bd (compressed) numbering) */
1104         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1105         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1106         m--;
1107       }
1108 
1109       /* put in diagonal portion */
1110       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1111         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1112       }
1113 
1114       /* put in upper diagonal portion */
1115       while (m-- > 0) {
1116         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1117       }
1118     }
1119     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1120 
1121     /*--------------------------------------------------------------------
1122        Gather all column indices to all processors
1123     */
1124     for (i=0; i<size; i++) {
1125       recvcounts[i] = 0;
1126       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1127         recvcounts[i] += lens[j];
1128       }
1129     }
1130     displs[0] = 0;
1131     for (i=1; i<size; i++) {
1132       displs[i] = displs[i-1] + recvcounts[i-1];
1133     }
1134 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1135     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1136 #else
1137     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1138 #endif
1139     /*--------------------------------------------------------------------
1140         Assemble the matrix into usable form (note numerical values not yet set)
1141     */
1142     /* set the b->ilen (length of each row) values */
1143     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1144     /* set the b->i indices */
1145     b->i[0] = 0;
1146     for (i=1; i<=A->rmap->N; i++) {
1147       b->i[i] = b->i[i-1] + lens[i-1];
1148     }
1149     ierr = PetscFree(lens);CHKERRQ(ierr);
1150     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1151     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1152 
1153   } else {
1154     B = **Bin;
1155     b = (Mat_SeqAIJ*)B->data;
1156   }
1157 
1158   /*--------------------------------------------------------------------
1159        Copy my part of matrix numerical values into the values location
1160   */
1161   if (flag == MAT_GET_VALUES) {
1162     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1163     MatScalar         *sendbuf,*recvbuf;
1164 
1165     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1166     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1167     sendcount = ad->nz + bd->nz;
1168     sendbuf   = b->a + b->i[rstarts[rank]];
1169     a_sendbuf = ada;
1170     b_sendbuf = bda;
1171     b_sendj   = bd->j;
1172     n         = A->rmap->rend - A->rmap->rstart;
1173     cnt       = 0;
1174     for (i=0; i<n; i++) {
1175       /* put in lower diagonal portion */
1176       m = bd->i[i+1] - bd->i[i];
1177       while (m > 0) {
1178         /* is it above diagonal (in bd (compressed) numbering) */
1179         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1180         sendbuf[cnt++] = *b_sendbuf++;
1181         m--;
1182         b_sendj++;
1183       }
1184 
1185       /* put in diagonal portion */
1186       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1187         sendbuf[cnt++] = *a_sendbuf++;
1188       }
1189 
1190       /* put in upper diagonal portion */
1191       while (m-- > 0) {
1192         sendbuf[cnt++] = *b_sendbuf++;
1193         b_sendj++;
1194       }
1195     }
1196     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1197 
1198     /* -----------------------------------------------------------------
1199        Gather all numerical values to all processors
1200     */
1201     if (!recvcounts) {
1202       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1203     }
1204     for (i=0; i<size; i++) {
1205       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1206     }
1207     displs[0] = 0;
1208     for (i=1; i<size; i++) {
1209       displs[i] = displs[i-1] + recvcounts[i-1];
1210     }
1211     recvbuf = b->a;
1212 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1213     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1214 #else
1215     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1216 #endif
1217     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1218     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1219   }  /* endof (flag == MAT_GET_VALUES) */
1220   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1221 
1222   if (A->symmetric) {
1223     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1224   } else if (A->hermitian) {
1225     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1226   } else if (A->structurally_symmetric) {
1227     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1228   }
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1233 {
1234   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1235   Mat            submat,A = c->A,B = c->B;
1236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1237   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1238   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1239   const PetscInt *icol,*irow;
1240   PetscInt       nrow,ncol,start;
1241   PetscErrorCode ierr;
1242   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1243   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1244   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1245   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1246   PetscInt       *lens,rmax,ncols,*cols,Crow;
1247 #if defined(PETSC_USE_CTABLE)
1248   PetscTable     cmap,rmap;
1249   PetscInt       *cmap_loc,*rmap_loc;
1250 #else
1251   PetscInt       *cmap,*rmap;
1252 #endif
1253   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1254   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1255   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1256   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1257   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1258   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1259   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1260   MPI_Comm       comm;
1261   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1262   PetscMPIInt    *onodes1,*olengths1,idex,end;
1263   Mat_SubSppt    *smatis1;
1264   PetscBool      isrowsorted,iscolsorted;
1265 
1266   PetscFunctionBegin;
1267   PetscValidLogicalCollectiveInt(C,ismax,2);
1268   PetscValidLogicalCollectiveEnum(C,scall,5);
1269   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1270   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1271   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1272   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1273   size = c->size;
1274   rank = c->rank;
1275 
1276   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1277   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1278   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1279   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1280   if (allcolumns) {
1281     icol = NULL;
1282     ncol = C->cmap->N;
1283   } else {
1284     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1285     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1286   }
1287 
1288   if (scall == MAT_INITIAL_MATRIX) {
1289     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1290 
1291     /* Get some new tags to keep the communication clean */
1292     tag1 = ((PetscObject)C)->tag;
1293     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1294     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1295 
1296     /* evaluate communication - mesg to who, length of mesg, and buffer space
1297      required. Based on this, buffers are allocated, and data copied into them */
1298     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1299     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1300 
1301     /* w1[proc] = num of rows owned by proc -- to be requested */
1302     proc = 0;
1303     nrqs = 0; /* num of outgoing messages */
1304     for (j=0; j<nrow; j++) {
1305       row  = irow[j];
1306       if (!isrowsorted) proc = 0;
1307       while (row >= C->rmap->range[proc+1]) proc++;
1308       w1[proc]++;
1309       row2proc[j] = proc; /* map row index to proc */
1310 
1311       if (proc != rank && !w2[proc]) {
1312         w2[proc] = 1; nrqs++;
1313       }
1314     }
1315     w1[rank] = 0;  /* rows owned by self will not be requested */
1316 
1317     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1318     for (proc=0,j=0; proc<size; proc++) {
1319       if (w1[proc]) { pa[j++] = proc;}
1320     }
1321 
1322     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1323     msz = 0;              /* total mesg length (for all procs) */
1324     for (i=0; i<nrqs; i++) {
1325       proc      = pa[i];
1326       w1[proc] += 3;
1327       msz      += w1[proc];
1328     }
1329     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1330 
1331     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1332     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1333     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1334 
1335     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1336        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1337     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1338 
1339     /* Now post the Irecvs corresponding to these messages */
1340     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1341 
1342     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1343     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1344 
1345     /* Allocate Memory for outgoing messages */
1346     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1347     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1348     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1349 
1350     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1351     iptr = tmp;
1352     for (i=0; i<nrqs; i++) {
1353       proc        = pa[i];
1354       sbuf1[proc] = iptr;
1355       iptr       += w1[proc];
1356     }
1357 
1358     /* Form the outgoing messages */
1359     /* Initialize the header space */
1360     for (i=0; i<nrqs; i++) {
1361       proc      = pa[i];
1362       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1363       ptr[proc] = sbuf1[proc] + 3;
1364     }
1365 
1366     /* Parse the isrow and copy data into outbuf */
1367     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1368     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1369       proc = row2proc[j];
1370       if (proc != rank) { /* copy to the outgoing buf*/
1371         *ptr[proc] = irow[j];
1372         ctr[proc]++; ptr[proc]++;
1373       }
1374     }
1375 
1376     /* Update the headers for the current IS */
1377     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1378       if ((ctr_j = ctr[j])) {
1379         sbuf1_j        = sbuf1[j];
1380         k              = ++sbuf1_j[0];
1381         sbuf1_j[2*k]   = ctr_j;
1382         sbuf1_j[2*k-1] = 0;
1383       }
1384     }
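         /* At this point each outgoing buffer reads
            sbuf1[proc] = [nis, is_no, count, row_0, ..., row_{count-1}]
            with nis = 1 and is_no = 0 here: one IS block followed by the requested global rows */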
1385 
1386     /* Now post the sends */
1387     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1388     for (i=0; i<nrqs; ++i) {
1389       proc = pa[i];
1390       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1391     }
1392 
1393     /* Post Receives to capture the buffer size */
1394     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1395     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1396 
1397     rbuf2[0] = tmp + msz;
1398     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1399 
1400     for (i=0; i<nrqs; ++i) {
1401       proc = pa[i];
1402       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1403     }
1404 
1405     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1406 
1407     /* Send to other procs the buf size they should allocate */
1408     /* Receive messages*/
1409     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1410     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1411 
1412     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1413     for (i=0; i<nrqr; ++i) {
1414       req_size[i] = 0;
1415       rbuf1_i        = rbuf1[i];
1416       start          = 2*rbuf1_i[0] + 1;
1417       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1418       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1419       sbuf2_i        = sbuf2[i];
1420       for (j=start; j<end; j++) {
1421         k            = rbuf1_i[j] - rstart;
1422         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1423         sbuf2_i[j]   = ncols;
1424         req_size[i] += ncols;
1425       }
1426       req_source1[i] = r_status1[i].MPI_SOURCE;
1427 
1428       /* form the header */
1429       sbuf2_i[0] = req_size[i];
1430       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1431 
1432       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1433     }
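         /* Reply format: sbuf2_i[0] = total number of entries this rank will send back (req_size[i]),
            the remaining header slots echo the request, and each requested row slot holds its
            nonzero count (A part plus B part) so the requester can size its receive buffers */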
1434 
1435     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1436     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1437 
1438     /* rbuf2 is received; post recvs for the column indices a->j */
1439     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1440 
1441     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1442     for (i=0; i<nrqs; ++i) {
1443       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1444       req_source2[i] = r_status2[i].MPI_SOURCE;
1445       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1446     }
1447 
1448     /* Wait on sends1 and sends2 */
1449     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1450     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1451     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1452     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1453 
1454     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1455     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1456 
1457     /* Now allocate sending buffers for a->j, and send them off */
1458     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1459     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1460     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1461     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
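         /* sbuf_aj is one contiguous block of sum(req_size[]) entries; sbuf_aj[i] are pointers
            into it, so a single PetscFree(sbuf_aj[0]) releases all per-destination segments later */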
1462 
1463     for (i=0; i<nrqr; i++) { /* for each requested message */
1464       rbuf1_i   = rbuf1[i];
1465       sbuf_aj_i = sbuf_aj[i];
1466       ct1       = 2*rbuf1_i[0] + 1;
1467       ct2       = 0;
1468       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1469 
1470       kmax = rbuf1[i][2];
1471       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1472         row    = rbuf1_i[ct1] - rstart;
1473         nzA    = ai[row+1] - ai[row];
1474         nzB    = bi[row+1] - bi[row];
1475         ncols  = nzA + nzB;
1476         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1477 
1478         /* load the column indices for this row into cols*/
1479         cols = sbuf_aj_i + ct2;
1480 
1481         lwrite = 0;
1482         for (l=0; l<nzB; l++) {
1483           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1484         }
1485         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1486         for (l=0; l<nzB; l++) {
1487           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1488         }
1489 
1490         ct2 += ncols;
1491       }
1492       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1493     }
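         /* The three passes above write each row's columns as: B columns with global index < cstart,
            then A columns shifted by cstart, then B columns >= cend. Since garray (bmap) and the
            per-row indices are sorted, the packed global column indices come out in ascending order */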
1494 
1495     /* create column map (cmap): global col of C -> local col of submat */
1496 #if defined(PETSC_USE_CTABLE)
1497     if (!allcolumns) {
1498       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1499       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1500       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1501         if (icol[j] >= cstart && icol[j] <cend) {
1502           cmap_loc[icol[j] - cstart] = j+1;
1503         } else { /* use PetscTable for non-local col indices */
1504           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1505         }
1506       }
1507     } else {
1508       cmap     = NULL;
1509       cmap_loc = NULL;
1510     }
1511     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1512 #else
1513     if (!allcolumns) {
1514       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1515       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1516     } else {
1517       cmap = NULL;
1518     }
1519 #endif
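         /* cmap stores j+1 so that 0 means "column not in iscol". With PETSC_USE_CTABLE the locally
            owned columns go through the dense array cmap_loc (indexed by local column) and all other
            columns through the PetscTable; otherwise a dense array over all global columns is used */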
1520 
1521     /* Create lens for MatSeqAIJSetPreallocation() */
1522     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1523 
1524     /* Compute lens from local part of C */
1525     for (j=0; j<nrow; j++) {
1526       row  = irow[j];
1527       proc = row2proc[j];
1528       if (proc == rank) {
1529         /* diagonal part A = c->A */
1530         ncols = ai[row-rstart+1] - ai[row-rstart];
1531         cols  = aj + ai[row-rstart];
1532         if (!allcolumns) {
1533           for (k=0; k<ncols; k++) {
1534 #if defined(PETSC_USE_CTABLE)
1535             tcol = cmap_loc[cols[k]];
1536 #else
1537             tcol = cmap[cols[k]+cstart];
1538 #endif
1539             if (tcol) lens[j]++;
1540           }
1541         } else { /* allcolumns */
1542           lens[j] = ncols;
1543         }
1544 
1545         /* off-diagonal part B = c->B */
1546         ncols = bi[row-rstart+1] - bi[row-rstart];
1547         cols  = bj + bi[row-rstart];
1548         if (!allcolumns) {
1549           for (k=0; k<ncols; k++) {
1550 #if defined(PETSC_USE_CTABLE)
1551             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1552 #else
1553             tcol = cmap[bmap[cols[k]]];
1554 #endif
1555             if (tcol) lens[j]++;
1556           }
1557         } else { /* allcolumns */
1558           lens[j] += ncols;
1559         }
1560       }
1561     }
1562 
1563     /* Create row map (rmap): global row of C -> local row of submat */
1564 #if defined(PETSC_USE_CTABLE)
1565     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1566     for (j=0; j<nrow; j++) {
1567       row  = irow[j];
1568       proc = row2proc[j];
1569       if (proc == rank) { /* a local row */
1570         rmap_loc[row - rstart] = j;
1571       } else {
1572         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1573       }
1574     }
1575 #else
1576     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1577     for (j=0; j<nrow; j++) {
1578       rmap[irow[j]] = j;
1579     }
1580 #endif
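         /* rmap is analogous: locally owned rows are stored directly (0-based) in rmap_loc since they
            always exist, while remote rows go into the PetscTable with a +1 shift that is undone
            after each successful lookup */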
1581 
1582     /* Update lens from offproc data */
1583     /* recv a->j is done */
1584     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1585     for (i=0; i<nrqs; i++) {
1586       proc    = pa[i];
1587       sbuf1_i = sbuf1[proc];
1588       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1589       ct1     = 2 + 1;
1590       ct2     = 0;
1591       rbuf2_i = rbuf2[i]; /* received length of C->j */
1592       rbuf3_i = rbuf3[i]; /* received C->j */
1593 
1594       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1595       max1   = sbuf1_i[2];
1596       for (k=0; k<max1; k++,ct1++) {
1597 #if defined(PETSC_USE_CTABLE)
1598         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1599         row--;
1600         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1601 #else
1602         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1603 #endif
1604         /* Now, store row index of submat in sbuf1_i[ct1] */
1605         sbuf1_i[ct1] = row;
1606 
1607         nnz = rbuf2_i[ct1];
1608         if (!allcolumns) {
1609           for (l=0; l<nnz; l++,ct2++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1612               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1613             } else {
1614               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1615             }
1616 #else
1617             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1618 #endif
1619             if (tcol) lens[row]++;
1620           }
1621         } else { /* allcolumns */
1622           lens[row] += nnz;
1623         }
1624       }
1625     }
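         /* sbuf1_i[ct1] now holds the submat row index instead of the global row index, so the value
            assembly below (and later MAT_REUSE calls, which keep sbuf1 in smatis1) can use it
            directly without another rmap lookup */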
1626     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1627     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1628 
1629     /* Create the submatrices */
1630     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1631     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1632 
1633     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1634     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1635     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1636     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1637     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1638 
1639     /* create struct Mat_SubSppt and attach it to submat */
1640     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1641     subc = (Mat_SeqAIJ*)submat->data;
1642     subc->submatis1 = smatis1;
1643 
1644     smatis1->id          = 0;
1645     smatis1->nrqs        = nrqs;
1646     smatis1->nrqr        = nrqr;
1647     smatis1->rbuf1       = rbuf1;
1648     smatis1->rbuf2       = rbuf2;
1649     smatis1->rbuf3       = rbuf3;
1650     smatis1->sbuf2       = sbuf2;
1651     smatis1->req_source2 = req_source2;
1652 
1653     smatis1->sbuf1       = sbuf1;
1654     smatis1->ptr         = ptr;
1655     smatis1->tmp         = tmp;
1656     smatis1->ctr         = ctr;
1657 
1658     smatis1->pa           = pa;
1659     smatis1->req_size     = req_size;
1660     smatis1->req_source1  = req_source1;
1661 
1662     smatis1->allcolumns  = allcolumns;
1663     smatis1->singleis    = PETSC_TRUE;
1664     smatis1->row2proc    = row2proc;
1665     smatis1->rmap        = rmap;
1666     smatis1->cmap        = cmap;
1667 #if defined(PETSC_USE_CTABLE)
1668     smatis1->rmap_loc    = rmap_loc;
1669     smatis1->cmap_loc    = cmap_loc;
1670 #endif
1671 
1672     smatis1->destroy     = submat->ops->destroy;
1673     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1674     submat->factortype   = C->factortype;
1675 
1676     /* compute rmax */
1677     rmax = 0;
1678     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
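         /* rmax bounds the number of columns in any submat row; it sizes the subcols/subvals
            work arrays used during assembly below */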
1679 
1680   } else { /* scall == MAT_REUSE_MATRIX */
1681     submat = submats[0];
1682     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1683 
1684     subc    = (Mat_SeqAIJ*)submat->data;
1685     rmax    = subc->rmax;
1686     smatis1 = subc->submatis1;
1687     nrqs        = smatis1->nrqs;
1688     nrqr        = smatis1->nrqr;
1689     rbuf1       = smatis1->rbuf1;
1690     rbuf2       = smatis1->rbuf2;
1691     rbuf3       = smatis1->rbuf3;
1692     req_source2 = smatis1->req_source2;
1693 
1694     sbuf1     = smatis1->sbuf1;
1695     sbuf2     = smatis1->sbuf2;
1696     ptr       = smatis1->ptr;
1697     tmp       = smatis1->tmp;
1698     ctr       = smatis1->ctr;
1699 
1700     pa         = smatis1->pa;
1701     req_size   = smatis1->req_size;
1702     req_source1 = smatis1->req_source1;
1703 
1704     allcolumns = smatis1->allcolumns;
1705     row2proc   = smatis1->row2proc;
1706     rmap       = smatis1->rmap;
1707     cmap       = smatis1->cmap;
1708 #if defined(PETSC_USE_CTABLE)
1709     rmap_loc   = smatis1->rmap_loc;
1710     cmap_loc   = smatis1->cmap_loc;
1711 #endif
1712   }
1713 
1714   /* Post recv matrix values */
1715   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1716   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1717   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1718   for (i=0; i<nrqs; ++i) {
1719     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1720     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1721   }
1722 
1723   /* Allocate sending buffers for a->a, and send them off */
1724   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1725   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1726   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1727   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1728 
1729   for (i=0; i<nrqr; i++) {
1730     rbuf1_i   = rbuf1[i];
1731     sbuf_aa_i = sbuf_aa[i];
1732     ct1       = 2*rbuf1_i[0]+1;
1733     ct2       = 0;
1734     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1735 
1736     kmax = rbuf1_i[2];
1737     for (k=0; k<kmax; k++,ct1++) {
1738       row = rbuf1_i[ct1] - rstart;
1739       nzA = ai[row+1] - ai[row];
1740       nzB = bi[row+1] - bi[row];
1741       ncols  = nzA + nzB;
1742       cworkB = bj + bi[row];
1743       vworkA = a_a + ai[row];
1744       vworkB = b_a + bi[row];
1745 
1746       /* load the column values for this row into vals*/
1747       vals = sbuf_aa_i + ct2;
1748 
1749       lwrite = 0;
1750       for (l=0; l<nzB; l++) {
1751         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1752       }
1753       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1754       for (l=0; l<nzB; l++) {
1755         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1756       }
1757 
1758       ct2 += ncols;
1759     }
1760     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1761   }
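       /* The values are packed in exactly the same B-lower/A/B-upper order as the column indices
          sent earlier, so rbuf3[i][k] and rbuf4[i][k] refer to the same matrix entry on the receiver */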
1762 
1763   /* Assemble submat */
1764   /* First assemble the local rows */
1765   for (j=0; j<nrow; j++) {
1766     row  = irow[j];
1767     proc = row2proc[j];
1768     if (proc == rank) {
1769       Crow = row - rstart;  /* local row index of C */
1770 #if defined(PETSC_USE_CTABLE)
1771       row = rmap_loc[Crow]; /* row index of submat */
1772 #else
1773       row = rmap[row];
1774 #endif
1775 
1776       if (allcolumns) {
1777         /* diagonal part A = c->A */
1778         ncols = ai[Crow+1] - ai[Crow];
1779         cols  = aj + ai[Crow];
1780         vals  = a_a + ai[Crow];
1781         i     = 0;
1782         for (k=0; k<ncols; k++) {
1783           subcols[i]   = cols[k] + cstart;
1784           subvals[i++] = vals[k];
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow+1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k=0; k<ncols; k++) {
1792           subcols[i]   = bmap[cols[k]];
1793           subvals[i++] = vals[k];
1794         }
1795 
1796         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1797 
1798       } else { /* !allcolumns */
1799 #if defined(PETSC_USE_CTABLE)
1800         /* diagonal part A = c->A */
1801         ncols = ai[Crow+1] - ai[Crow];
1802         cols  = aj + ai[Crow];
1803         vals  = a_a + ai[Crow];
1804         i     = 0;
1805         for (k=0; k<ncols; k++) {
1806           tcol = cmap_loc[cols[k]];
1807           if (tcol) {
1808             subcols[i]   = --tcol;
1809             subvals[i++] = vals[k];
1810           }
1811         }
1812 
1813         /* off-diagonal part B = c->B */
1814         ncols = bi[Crow+1] - bi[Crow];
1815         cols  = bj + bi[Crow];
1816         vals  = b_a + bi[Crow];
1817         for (k=0; k<ncols; k++) {
1818           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1819           if (tcol) {
1820             subcols[i]   = --tcol;
1821             subvals[i++] = vals[k];
1822           }
1823         }
1824 #else
1825         /* diagonal part A = c->A */
1826         ncols = ai[Crow+1] - ai[Crow];
1827         cols  = aj + ai[Crow];
1828         vals  = a_a + ai[Crow];
1829         i     = 0;
1830         for (k=0; k<ncols; k++) {
1831           tcol = cmap[cols[k]+cstart];
1832           if (tcol) {
1833             subcols[i]   = --tcol;
1834             subvals[i++] = vals[k];
1835           }
1836         }
1837 
1838         /* off-diagonal part B = c->B */
1839         ncols = bi[Crow+1] - bi[Crow];
1840         cols  = bj + bi[Crow];
1841         vals  = b_a + bi[Crow];
1842         for (k=0; k<ncols; k++) {
1843           tcol = cmap[bmap[cols[k]]];
1844           if (tcol) {
1845             subcols[i]   = --tcol;
1846             subvals[i++] = vals[k];
1847           }
1848         }
1849 #endif
1850         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1851       }
1852     }
1853   }
1854 
1855   /* Now assemble the off-proc rows */
1856   for (i=0; i<nrqs; i++) { /* for each requested message */
1857     /* recv values from other processes */
1858     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1859     proc    = pa[idex];
1860     sbuf1_i = sbuf1[proc];
1861     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1862     ct1     = 2 + 1;
1863     ct2     = 0; /* count of received C->j */
1864     ct3     = 0; /* count of received C->j that will be inserted into submat */
1865     rbuf2_i = rbuf2[idex]; /* received lengths of C->j rows from other processes */
1866     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1867     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1868 
1869     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1870     max1 = sbuf1_i[2];             /* num of rows */
1871     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1872       row = sbuf1_i[ct1]; /* row index of submat */
1873       if (!allcolumns) {
1874         idex = 0;
1875         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1876           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1877           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1878 #if defined(PETSC_USE_CTABLE)
1879             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1880               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1881             } else {
1882               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1883             }
1884 #else
1885             tcol = cmap[rbuf3_i[ct2]];
1886 #endif
1887             if (tcol) {
1888               subcols[idex]   = --tcol; /* may not be sorted */
1889               subvals[idex++] = rbuf4_i[ct2];
1890 
1891               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1892                For reuse, we replace received C->j with index that should be inserted to submat */
1893               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1894             }
1895           }
1896           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1897         } else { /* scall == MAT_REUSE_MATRIX */
1898           submat = submats[0];
1899           subc   = (Mat_SeqAIJ*)submat->data;
1900 
1901           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1902           for (l=0; l<nnz; l++) {
1903             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1904             subvals[idex++] = rbuf4_i[ct2];
1905           }
1906 
1907           bj = subc->j + subc->i[row]; /* sorted column indices */
1908           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1909         }
1910       } else { /* allcolumns */
1911         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1912         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1913         ct2 += nnz;
1914       }
1915     }
1916   }
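       /* For MAT_INITIAL_MATRIX with sorted iscol, rbuf3_i has been compacted in place to hold the
          positions (ct2) of the accepted entries; a later MAT_REUSE call can then gather rbuf4_i
          directly against the already sorted subc->j without any cmap/rmap lookups */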
1917 
1918   /* wait until the sends of a->a are done */
1919   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1920   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1921 
1922   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   submats[0] = submat;
1925 
1926   /* Restore the indices */
1927   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1928   if (!allcolumns) {
1929     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1930   }
1931 
1932   /* Destroy allocated memory */
1933   for (i=0; i<nrqs; ++i) {
1934     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); /* each rbuf4[i] was obtained with PetscMalloc1() */
1935   }
1936   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1937   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1938   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1943     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1944   }
1945   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1946   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1951 {
1952   PetscErrorCode ierr;
1953   PetscInt       ncol;
1954   PetscBool      colflag,allcolumns=PETSC_FALSE;
1955 
1956   PetscFunctionBegin;
1957   /* Allocate memory to hold all the submatrices */
1958   if (scall == MAT_INITIAL_MATRIX) {
1959     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1960   }
1961 
1962   /* Check for special case: each processor gets entire matrix columns */
1963   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1964   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1965   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1966 
1967   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1968   PetscFunctionReturn(0);
1969 }
1970 
1971 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1972 {
1973   PetscErrorCode ierr;
1974   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1975   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1976   Mat_SeqAIJ     *subc;
1977   Mat_SubSppt    *smat;
1978 
1979   PetscFunctionBegin;
1980   /* Check for special case: each processor has a single IS */
1981   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1982     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1983     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1984     PetscFunctionReturn(0);
1985   }
1986 
1987   /* Collect global wantallmatrix and nstages */
1988   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1989   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1990   if (!nmax) nmax = 1;
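       /* e.g. with 4-byte PetscInt and C->cmap->N = 1e6 this gives nmax = 5 submatrices per stage
          (nmax = 2 with 8-byte indices), keeping the per-stage index buffers around 20 MB */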
1991 
1992   if (scall == MAT_INITIAL_MATRIX) {
1993     /* Collect global wantallmatrix and nstages */
1994     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1995       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1996       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1997       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1998       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1999       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2000         wantallmatrix = PETSC_TRUE;
2001 
2002         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2003       }
2004     }
2005 
2006     /* Determine the number of stages through which submatrices are done
2007        Each stage will extract nmax submatrices.
2008        nmax is determined by the matrix column dimension.
2009        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2010     */
2011     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2012 
2013     in[0] = -1*(PetscInt)wantallmatrix;
2014     in[1] = nstages;
2015     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
2016     wantallmatrix = (PetscBool)(-out[0]);
2017     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2018 
2019   } else { /* MAT_REUSE_MATRIX */
2020     if (ismax) {
2021       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2022       smat = subc->submatis1;
2023     } else { /* (*submat)[0] is a dummy matrix */
2024       smat = (Mat_SubSppt*)(*submat)[0]->data;
2025     }
2026     if (!smat) {
2027       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2028       wantallmatrix = PETSC_TRUE;
2029     } else if (smat->singleis) {
2030       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2031       PetscFunctionReturn(0);
2032     } else {
2033       nstages = smat->nstages;
2034     }
2035   }
2036 
2037   if (wantallmatrix) {
2038     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2039     PetscFunctionReturn(0);
2040   }
2041 
2042   /* Allocate memory to hold all the submatrices and dummy submatrices */
2043   if (scall == MAT_INITIAL_MATRIX) {
2044     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2045   }
2046 
2047   for (i=0,pos=0; i<nstages; i++) {
2048     if (pos+nmax <= ismax) max_no = nmax;
2049     else if (pos >= ismax) max_no = 0;
2050     else                   max_no = ismax-pos;
2051 
2052     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2053     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2054       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2055       smat->nstages = nstages;
2056     }
2057     pos += max_no;
2058   }
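       /* Ranks that have exhausted their own IS list still participate in every stage with max_no = 0;
          on MAT_INITIAL_MATRIX they get a dummy submatrix whose Mat_SubSppt carries nstages for reuse */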
2059 
2060   if (ismax && scall == MAT_INITIAL_MATRIX) {
2061     /* save nstages for reuse */
2062     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2063     smat = subc->submatis1;
2064     smat->nstages = nstages;
2065   }
2066   PetscFunctionReturn(0);
2067 }
2068 
2069 /* -------------------------------------------------------------------------*/
2070 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2071 {
2072   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2073   Mat            A  = c->A;
2074   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2075   const PetscInt **icol,**irow;
2076   PetscInt       *nrow,*ncol,start;
2077   PetscErrorCode ierr;
2078   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2079   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2080   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2081   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2082   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2083 #if defined(PETSC_USE_CTABLE)
2084   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2085 #else
2086   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2087 #endif
2088   const PetscInt *irow_i;
2089   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2090   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2091   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2092   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2093   MPI_Status     *r_status3,*r_status4,*s_status4;
2094   MPI_Comm       comm;
2095   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2096   PetscMPIInt    *onodes1,*olengths1,end;
2097   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2098   Mat_SubSppt    *smat_i;
2099   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2100   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2101 
2102   PetscFunctionBegin;
2103   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2104   size = c->size;
2105   rank = c->rank;
2106 
2107   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2108   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2109 
2110   for (i=0; i<ismax; i++) {
2111     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2112     if (!issorted[i]) iscsorted = issorted[i];
2113 
2114     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2115 
2116     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2117     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2118 
2119     /* Check for special case: allcolumn */
2120     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2121     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2122     if (colflag && ncol[i] == C->cmap->N) {
2123       allcolumns[i] = PETSC_TRUE;
2124       icol[i] = NULL;
2125     } else {
2126       allcolumns[i] = PETSC_FALSE;
2127       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2128     }
2129   }
2130 
2131   if (scall == MAT_REUSE_MATRIX) {
2132     /* Assumes the new rows are the same length as the old rows */
2133     for (i=0; i<ismax; i++) {
2134       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2135       subc = (Mat_SeqAIJ*)submats[i]->data;
2136       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2137 
2138       /* Initialize the matrix as if it were empty */
2139       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2140 
2141       smat_i   = subc->submatis1;
2142 
2143       nrqs        = smat_i->nrqs;
2144       nrqr        = smat_i->nrqr;
2145       rbuf1       = smat_i->rbuf1;
2146       rbuf2       = smat_i->rbuf2;
2147       rbuf3       = smat_i->rbuf3;
2148       req_source2 = smat_i->req_source2;
2149 
2150       sbuf1     = smat_i->sbuf1;
2151       sbuf2     = smat_i->sbuf2;
2152       ptr       = smat_i->ptr;
2153       tmp       = smat_i->tmp;
2154       ctr       = smat_i->ctr;
2155 
2156       pa          = smat_i->pa;
2157       req_size    = smat_i->req_size;
2158       req_source1 = smat_i->req_source1;
2159 
2160       allcolumns[i] = smat_i->allcolumns;
2161       row2proc[i]   = smat_i->row2proc;
2162       rmap[i]       = smat_i->rmap;
2163       cmap[i]       = smat_i->cmap;
2164     }
2165 
2166     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2167       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2168       smat_i = (Mat_SubSppt*)submats[0]->data;
2169 
2170       nrqs        = smat_i->nrqs;
2171       nrqr        = smat_i->nrqr;
2172       rbuf1       = smat_i->rbuf1;
2173       rbuf2       = smat_i->rbuf2;
2174       rbuf3       = smat_i->rbuf3;
2175       req_source2 = smat_i->req_source2;
2176 
2177       sbuf1       = smat_i->sbuf1;
2178       sbuf2       = smat_i->sbuf2;
2179       ptr         = smat_i->ptr;
2180       tmp         = smat_i->tmp;
2181       ctr         = smat_i->ctr;
2182 
2183       pa          = smat_i->pa;
2184       req_size    = smat_i->req_size;
2185       req_source1 = smat_i->req_source1;
2186 
2187       allcolumns[0] = PETSC_FALSE;
2188     }
2189   } else { /* scall == MAT_INITIAL_MATRIX */
2190     /* Get some new tags to keep the communication clean */
2191     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2192     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2193 
2194     /* evaluate communication - mesg to who, length of mesg, and buffer space
2195      required. Based on this, buffers are allocated, and data copied into them*/
2196     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2197 
2198     for (i=0; i<ismax; i++) {
2199       jmax   = nrow[i];
2200       irow_i = irow[i];
2201 
2202       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2203       row2proc[i] = row2proc_i;
2204 
2205       if (issorted[i]) proc = 0;
2206       for (j=0; j<jmax; j++) {
2207         if (!issorted[i]) proc = 0;
2208         row = irow_i[j];
2209         while (row >= C->rmap->range[proc+1]) proc++;
2210         w4[proc]++;
2211         row2proc_i[j] = proc; /* map row index to proc */
2212       }
2213       for (j=0; j<size; j++) {
2214         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2215       }
2216     }
2217 
2218     nrqs     = 0;              /* no of outgoing messages */
2219     msz      = 0;              /* total mesg length (for all procs) */
2220     w1[rank] = 0;              /* no mesg sent to self */
2221     w3[rank] = 0;
2222     for (i=0; i<size; i++) {
2223       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2224     }
2225     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2226     for (i=0,j=0; i<size; i++) {
2227       if (w1[i]) { pa[j] = i; j++; }
2228     }
2229 
2230     /* Each message would have a header = 1 + 2*(no of IS) + data */
2231     for (i=0; i<nrqs; i++) {
2232       j      = pa[i];
2233       w1[j] += w2[j] + 2* w3[j];
2234       msz   += w1[j];
2235     }
2236     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2237 
2238     /* Determine the number of messages to expect, their lengths, and their source ranks (from-ids) */
2239     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2240     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2241 
2242     /* Now post the Irecvs corresponding to these messages */
2243     tag0 = ((PetscObject)C)->tag;
2244     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2245 
2246     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2247     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2248 
2249     /* Allocate Memory for outgoing messages */
2250     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2251     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2252     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2253 
2254     {
2255       PetscInt *iptr = tmp;
2256       k    = 0;
2257       for (i=0; i<nrqs; i++) {
2258         j        = pa[i];
2259         iptr    += k;
2260         sbuf1[j] = iptr;
2261         k        = w1[j];
2262       }
2263     }
2264 
2265     /* Form the outgoing messages. Initialize the header space */
2266     for (i=0; i<nrqs; i++) {
2267       j           = pa[i];
2268       sbuf1[j][0] = 0;
2269       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2270       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2271     }
2272 
2273     /* Parse the isrow and copy data into outbuf */
2274     for (i=0; i<ismax; i++) {
2275       row2proc_i = row2proc[i];
2276       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2277       irow_i = irow[i];
2278       jmax   = nrow[i];
2279       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2280         proc = row2proc_i[j];
2281         if (proc != rank) { /* copy to the outgoing buf*/
2282           ctr[proc]++;
2283           *ptr[proc] = irow_i[j];
2284           ptr[proc]++;
2285         }
2286       }
2287       /* Update the headers for the current IS */
2288       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2289         if ((ctr_j = ctr[j])) {
2290           sbuf1_j        = sbuf1[j];
2291           k              = ++sbuf1_j[0];
2292           sbuf1_j[2*k]   = ctr_j;
2293           sbuf1_j[2*k-1] = i;
2294         }
2295       }
2296     }
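         /* Each outgoing buffer now reads sbuf1[j] = [nis, is_0, cnt_0, is_1, cnt_1, ...,
            rows of is_0, rows of is_1, ...] where only the IS that actually request rows
            from proc j appear in the header */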
2297 
2298     /*  Now  post the sends */
2299     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2300     for (i=0; i<nrqs; ++i) {
2301       j    = pa[i];
2302       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2303     }
2304 
2305     /* Post Receives to capture the buffer size */
2306     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2307     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2308     rbuf2[0] = tmp + msz;
2309     for (i=1; i<nrqs; ++i) {
2310       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2311     }
2312     for (i=0; i<nrqs; ++i) {
2313       j    = pa[i];
2314       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2315     }
2316 
2317     /* Send to other procs the buf size they should allocate */
2318     /* Receive messages*/
2319     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2320     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2321     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2322     {
2323       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2324       PetscInt   *sbuf2_i;
2325 
2326       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
2327       for (i=0; i<nrqr; ++i) {
2328         req_size[i] = 0;
2329         rbuf1_i        = rbuf1[i];
2330         start          = 2*rbuf1_i[0] + 1;
2331         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
2332         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2333         sbuf2_i        = sbuf2[i];
2334         for (j=start; j<end; j++) {
2335           id              = rbuf1_i[j] - rstart;
2336           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2337           sbuf2_i[j]      = ncols;
2338           req_size[i] += ncols;
2339         }
2340         req_source1[i] = r_status1[i].MPI_SOURCE;
2341         /* form the header */
2342         sbuf2_i[0] = req_size[i];
2343         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2344 
2345         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2346       }
2347     }
2348     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2349     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2350     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2351 
2352     /* Receive messages*/
2353     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2354     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2355 
2356     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
2357     for (i=0; i<nrqs; ++i) {
2358       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2359       req_source2[i] = r_status2[i].MPI_SOURCE;
2360       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2361     }
2362     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2363     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2364 
2365     /* Wait on sends1 and sends2 */
2366     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2367     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2368 
2369     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);}
2370     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);}
2371     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2372     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2373     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2374     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2375 
2376     /* Now allocate sending buffers for a->j, and send them off */
2377     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2378     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2379     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2380     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2381 
2382     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2383     {
2384       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2385       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2386       PetscInt cend = C->cmap->rend;
2387       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2388 
2389       for (i=0; i<nrqr; i++) {
2390         rbuf1_i   = rbuf1[i];
2391         sbuf_aj_i = sbuf_aj[i];
2392         ct1       = 2*rbuf1_i[0] + 1;
2393         ct2       = 0;
2394         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2395           kmax = rbuf1[i][2*j];
2396           for (k=0; k<kmax; k++,ct1++) {
2397             row    = rbuf1_i[ct1] - rstart;
2398             nzA    = a_i[row+1] - a_i[row];
2399             nzB    = b_i[row+1] - b_i[row];
2400             ncols  = nzA + nzB;
2401             cworkA = a_j + a_i[row];
2402             cworkB = b_j + b_i[row];
2403 
2404             /* load the column indices for this row into cols */
2405             cols = sbuf_aj_i + ct2;
2406 
2407             lwrite = 0;
2408             for (l=0; l<nzB; l++) {
2409               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2410             }
2411             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2412             for (l=0; l<nzB; l++) {
2413               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2414             }
2415 
2416             ct2 += ncols;
2417           }
2418         }
2419         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2420       }
2421     }
2422     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2423 
2424     /* create col map: global col of C -> local col of submatrices */
2425     {
2426       const PetscInt *icol_i;
2427 #if defined(PETSC_USE_CTABLE)
2428       for (i=0; i<ismax; i++) {
2429         if (!allcolumns[i]) {
2430           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2431 
2432           jmax   = ncol[i];
2433           icol_i = icol[i];
2434           cmap_i = cmap[i];
2435           for (j=0; j<jmax; j++) {
2436             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2437           }
2438         } else cmap[i] = NULL;
2439       }
2440 #else
2441       for (i=0; i<ismax; i++) {
2442         if (!allcolumns[i]) {
2443           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2444           jmax   = ncol[i];
2445           icol_i = icol[i];
2446           cmap_i = cmap[i];
2447           for (j=0; j<jmax; j++) {
2448             cmap_i[icol_i[j]] = j+1;
2449           }
2450         } else cmap[i] = NULL;
2451       }
2452 #endif
2453     }
2454 
2455     /* Create lens which is required for MatCreate... */
2456     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2457     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2458 
2459     if (ismax) {
2460       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2461     }
2462     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
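         /* lens[0] points to one zeroed block holding all per-IS length arrays back to back
            (lens[i] = lens[i-1] + nrow[i-1]), so a single PetscFree(lens[0]) releases them all */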
2463 
2464     /* Update lens from local data */
2465     for (i=0; i<ismax; i++) {
2466       row2proc_i = row2proc[i];
2467       jmax = nrow[i];
2468       if (!allcolumns[i]) cmap_i = cmap[i];
2469       irow_i = irow[i];
2470       lens_i = lens[i];
2471       for (j=0; j<jmax; j++) {
2472         row = irow_i[j];
2473         proc = row2proc_i[j];
2474         if (proc == rank) {
2475           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2476           if (!allcolumns[i]) {
2477             for (k=0; k<ncols; k++) {
2478 #if defined(PETSC_USE_CTABLE)
2479               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2480 #else
2481               tcol = cmap_i[cols[k]];
2482 #endif
2483               if (tcol) lens_i[j]++;
2484             }
2485           } else { /* allcolumns */
2486             lens_i[j] = ncols;
2487           }
2488           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2489         }
2490       }
2491     }
2492 
2493     /* Create row map: global row of C -> local row of submatrices */
2494 #if defined(PETSC_USE_CTABLE)
2495     for (i=0; i<ismax; i++) {
2496       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2497       irow_i = irow[i];
2498       jmax   = nrow[i];
2499       for (j=0; j<jmax; j++) {
2500         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2501       }
2502     }
2503 #else
2504     for (i=0; i<ismax; i++) {
2505       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2506       rmap_i = rmap[i];
2507       irow_i = irow[i];
2508       jmax   = nrow[i];
2509       for (j=0; j<jmax; j++) {
2510         rmap_i[irow_i[j]] = j;
2511       }
2512     }
2513 #endif
2514 
2515     /* Update lens from offproc data */
2516     {
2517       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2518 
2519       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
2520       for (tmp2=0; tmp2<nrqs; tmp2++) {
2521         sbuf1_i = sbuf1[pa[tmp2]];
2522         jmax    = sbuf1_i[0];
2523         ct1     = 2*jmax+1;
2524         ct2     = 0;
2525         rbuf2_i = rbuf2[tmp2];
2526         rbuf3_i = rbuf3[tmp2];
2527         for (j=1; j<=jmax; j++) {
2528           is_no  = sbuf1_i[2*j-1];
2529           max1   = sbuf1_i[2*j];
2530           lens_i = lens[is_no];
2531           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2532           rmap_i = rmap[is_no];
2533           for (k=0; k<max1; k++,ct1++) {
2534 #if defined(PETSC_USE_CTABLE)
2535             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2536             row--;
2537             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2538 #else
2539             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2540 #endif
2541             max2 = rbuf2_i[ct1];
2542             for (l=0; l<max2; l++,ct2++) {
2543               if (!allcolumns[is_no]) {
2544 #if defined(PETSC_USE_CTABLE)
2545                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2546 #else
2547                 tcol = cmap_i[rbuf3_i[ct2]];
2548 #endif
2549                 if (tcol) lens_i[row]++;
2550               } else { /* allcolumns */
2551                 lens_i[row]++; /* lens_i[row] += max2 ? */
2552               }
2553             }
2554           }
2555         }
2556       }
2557     }
2558     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2559     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);}
2560     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2561     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2562 
2563     /* Create the submatrices */
2564     for (i=0; i<ismax; i++) {
2565       PetscInt    rbs,cbs;
2566 
2567       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2568       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2569 
2570       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2571       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2572 
2573       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2574       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2575       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2576 
2577       /* create struct Mat_SubSppt and attach it to submat */
2578       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2579       subc = (Mat_SeqAIJ*)submats[i]->data;
2580       subc->submatis1 = smat_i;
2581 
2582       smat_i->destroy          = submats[i]->ops->destroy;
2583       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2584       submats[i]->factortype   = C->factortype;
2585 
2586       smat_i->id          = i;
2587       smat_i->nrqs        = nrqs;
2588       smat_i->nrqr        = nrqr;
2589       smat_i->rbuf1       = rbuf1;
2590       smat_i->rbuf2       = rbuf2;
2591       smat_i->rbuf3       = rbuf3;
2592       smat_i->sbuf2       = sbuf2;
2593       smat_i->req_source2 = req_source2;
2594 
2595       smat_i->sbuf1       = sbuf1;
2596       smat_i->ptr         = ptr;
2597       smat_i->tmp         = tmp;
2598       smat_i->ctr         = ctr;
2599 
2600       smat_i->pa           = pa;
2601       smat_i->req_size     = req_size;
2602       smat_i->req_source1  = req_source1;
2603 
2604       smat_i->allcolumns  = allcolumns[i];
2605       smat_i->singleis    = PETSC_FALSE;
2606       smat_i->row2proc    = row2proc[i];
2607       smat_i->rmap        = rmap[i];
2608       smat_i->cmap        = cmap[i];
2609     }
2610 
2611     if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
2612       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2613       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2614       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2615 
2616       /* create struct Mat_SubSppt and attach it to submat */
2617       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2618       submats[0]->data = (void*)smat_i;
2619 
2620       smat_i->destroy          = submats[0]->ops->destroy;
2621       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2622       submats[0]->factortype   = C->factortype;
2623 
2624       smat_i->id          = 0;
2625       smat_i->nrqs        = nrqs;
2626       smat_i->nrqr        = nrqr;
2627       smat_i->rbuf1       = rbuf1;
2628       smat_i->rbuf2       = rbuf2;
2629       smat_i->rbuf3       = rbuf3;
2630       smat_i->sbuf2       = sbuf2;
2631       smat_i->req_source2 = req_source2;
2632 
2633       smat_i->sbuf1       = sbuf1;
2634       smat_i->ptr         = ptr;
2635       smat_i->tmp         = tmp;
2636       smat_i->ctr         = ctr;
2637 
2638       smat_i->pa           = pa;
2639       smat_i->req_size     = req_size;
2640       smat_i->req_source1  = req_source1;
2641 
2642       smat_i->allcolumns  = PETSC_FALSE;
2643       smat_i->singleis    = PETSC_FALSE;
2644       smat_i->row2proc    = NULL;
2645       smat_i->rmap        = NULL;
2646       smat_i->cmap        = NULL;
2647     }
2648 
2649     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2650     ierr = PetscFree(lens);CHKERRQ(ierr);
2651     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2652     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2653 
2654   } /* endof scall == MAT_INITIAL_MATRIX */
2655 
2656   /* Post recv matrix values */
2657   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2658   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2659   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2660   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2661   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2662   for (i=0; i<nrqs; ++i) {
2663     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2664     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2665   }
2666 
2667   /* Allocate sending buffers for a->a, and send them off */
2668   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2669   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2670   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2671   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2672 
2673   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2674   {
2675     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2676     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2677     PetscInt    cend   = C->cmap->rend;
2678     PetscInt    *b_j   = b->j;
2679     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2680 
2681     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2682     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2683     for (i=0; i<nrqr; i++) {
2684       rbuf1_i   = rbuf1[i];
2685       sbuf_aa_i = sbuf_aa[i];
2686       ct1       = 2*rbuf1_i[0]+1;
2687       ct2       = 0;
2688       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2689         kmax = rbuf1_i[2*j];
2690         for (k=0; k<kmax; k++,ct1++) {
2691           row    = rbuf1_i[ct1] - rstart;
2692           nzA    = a_i[row+1] - a_i[row];
2693           nzB    = b_i[row+1] - b_i[row];
2694           ncols  = nzA + nzB;
2695           cworkB = b_j + b_i[row];
2696           vworkA = a_a + a_i[row];
2697           vworkB = b_a + b_i[row];
2698 
2699           /* load the column values for this row into vals*/
2700           vals = sbuf_aa_i+ct2;
2701 
2702           lwrite = 0;
2703           for (l=0; l<nzB; l++) {
2704             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2705           }
2706           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2707           for (l=0; l<nzB; l++) {
2708             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2709           }
2710 
2711           ct2 += ncols;
2712         }
2713       }
2714       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2715     }
2716     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2717     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2718   }
2719 
2720   /* Assemble the matrices */
2721   /* First assemble the local rows */
2722   for (i=0; i<ismax; i++) {
2723     row2proc_i = row2proc[i];
2724     subc      = (Mat_SeqAIJ*)submats[i]->data;
2725     imat_ilen = subc->ilen;
2726     imat_j    = subc->j;
2727     imat_i    = subc->i;
2728     imat_a    = subc->a;
2729 
2730     if (!allcolumns[i]) cmap_i = cmap[i];
2731     rmap_i = rmap[i];
2732     irow_i = irow[i];
2733     jmax   = nrow[i];
2734     for (j=0; j<jmax; j++) {
2735       row  = irow_i[j];
2736       proc = row2proc_i[j];
2737       if (proc == rank) {
2738         old_row = row;
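        /* Convert C's global row number into the corresponding row of submats[i];
           the PetscTable variant uses 1-based keys and values, hence the +1 lookup and the decrement. */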
2739 #if defined(PETSC_USE_CTABLE)
2740         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2741         row--;
2742 #else
2743         row = rmap_i[row];
2744 #endif
2745         ilen_row = imat_ilen[row];
2746         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2747         mat_i    = imat_i[row];
2748         mat_a    = imat_a + mat_i;
2749         mat_j    = imat_j + mat_i;
2750         if (!allcolumns[i]) {
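          /* cmap_i maps C's global column numbers to 1-based submatrix column numbers;
             a result of 0 means the column was not requested, so the entry is dropped. */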
2751           for (k=0; k<ncols; k++) {
2752 #if defined(PETSC_USE_CTABLE)
2753             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2754 #else
2755             tcol = cmap_i[cols[k]];
2756 #endif
2757             if (tcol) {
2758               *mat_j++ = tcol - 1;
2759               *mat_a++ = vals[k];
2760               ilen_row++;
2761             }
2762           }
2763         } else { /* allcolumns */
2764           for (k=0; k<ncols; k++) {
2765             *mat_j++ = cols[k];  /* global col index! */
2766             *mat_a++ = vals[k];
2767             ilen_row++;
2768           }
2769         }
2770         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2771 
2772         imat_ilen[row] = ilen_row;
2773       }
2774     }
2775   }
2776 
2777   /* Now assemble the off proc rows */
2778   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr);
2779   for (tmp2=0; tmp2<nrqs; tmp2++) {
2780     sbuf1_i = sbuf1[pa[tmp2]];
2781     jmax    = sbuf1_i[0];
2782     ct1     = 2*jmax + 1;
2783     ct2     = 0;
2784     rbuf2_i = rbuf2[tmp2];
2785     rbuf3_i = rbuf3[tmp2];
2786     rbuf4_i = rbuf4[tmp2];
2787     for (j=1; j<=jmax; j++) {
2788       is_no     = sbuf1_i[2*j-1];
2789       rmap_i    = rmap[is_no];
2790       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2791       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2792       imat_ilen = subc->ilen;
2793       imat_j    = subc->j;
2794       imat_i    = subc->i;
2795       imat_a    = subc->a;
2796       max1      = sbuf1_i[2*j];
2797       for (k=0; k<max1; k++,ct1++) {
2798         row = sbuf1_i[ct1];
2799 #if defined(PETSC_USE_CTABLE)
2800         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2801         row--;
2802 #else
2803         row = rmap_i[row];
2804 #endif
2805         ilen  = imat_ilen[row];
2806         mat_i = imat_i[row];
2807         mat_a = imat_a + mat_i;
2808         mat_j = imat_j + mat_i;
2809         max2  = rbuf2_i[ct1];
2810         if (!allcolumns[is_no]) {
2811           for (l=0; l<max2; l++,ct2++) {
2812 #if defined(PETSC_USE_CTABLE)
2813             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2814 #else
2815             tcol = cmap_i[rbuf3_i[ct2]];
2816 #endif
2817             if (tcol) {
2818               *mat_j++ = tcol - 1;
2819               *mat_a++ = rbuf4_i[ct2];
2820               ilen++;
2821             }
2822           }
2823         } else { /* allcolumns */
2824           for (l=0; l<max2; l++,ct2++) {
2825             *mat_j++ = rbuf3_i[ct2]; /* same global column index as in C */
2826             *mat_a++ = rbuf4_i[ct2];
2827             ilen++;
2828           }
2829         }
2830         imat_ilen[row] = ilen;
2831       }
2832     }
2833   }
2834 
2835   if (!iscsorted) { /* sort column indices of the rows */
2836     for (i=0; i<ismax; i++) {
2837       subc      = (Mat_SeqAIJ*)submats[i]->data;
2838       imat_j    = subc->j;
2839       imat_i    = subc->i;
2840       imat_a    = subc->a;
2841       imat_ilen = subc->ilen;
2842 
2843       if (allcolumns[i]) continue;
2844       jmax = nrow[i];
2845       for (j=0; j<jmax; j++) {
2846         mat_i = imat_i[j];
2847         mat_a = imat_a + mat_i;
2848         mat_j = imat_j + mat_i;
2849         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2850       }
2851     }
2852   }
2853 
2854   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2855   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2856   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);}
2857   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2858   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2859 
2860   /* Restore the indices */
2861   for (i=0; i<ismax; i++) {
2862     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2863     if (!allcolumns[i]) {
2864       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2865     }
2866   }
2867 
2868   for (i=0; i<ismax; i++) {
2869     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2870     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2871   }
2872 
2873   /* Destroy allocated memory */
2874   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2875   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2876   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2877 
2878   for (i=0; i<nrqs; ++i) {
2879     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2880   }
2881   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2882 
2883   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2884   PetscFunctionReturn(0);
2885 }
2886 
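/*
   Caller-side sketch (illustrative only, not part of this file's build): the routine above is part of
   the machinery behind MatCreateSubMatrices() for MPIAIJ matrices. Assuming C is an assembled MPIAIJ
   matrix and first_row/first_col are valid (hypothetical) starting indices on this rank, extracting one
   5x5 sequential submatrix per rank looks like:

     IS  rows,cols;
     Mat *submats;

     ierr = ISCreateStride(PETSC_COMM_SELF,5,first_row,1,&rows);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,5,first_col,1,&cols);CHKERRQ(ierr);
     ierr = MatCreateSubMatrices(C,1,&rows,&cols,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     ... work with submats[0], a SEQAIJ matrix on PETSC_COMM_SELF ...
     ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
     ierr = ISDestroy(&rows);CHKERRQ(ierr);
     ierr = ISDestroy(&cols);CHKERRQ(ierr);
*/
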
2887 /*
2888  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2889  The embeddings are assumed to be injective, which implies that the range of rowemb is a subset
2890  of [0,m), that of dcolemb a subset of [0,n), and that of ocolemb a subset of [0,N-n).
2891  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A & B.
2892  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2893  state and needs to be "assembled" later by compressing B's column space.
2894 
2895  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2896  Following this call, C->A & C->B have been created, even if empty. (An illustrative construction sketch follows the function.)
2897  */
2898 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2899 {
2900   /* If making this function public, change the error returned in this function away from _PLIB. */
2901   PetscErrorCode ierr;
2902   Mat_MPIAIJ     *aij;
2903   Mat_SeqAIJ     *Baij;
2904   PetscBool      seqaij,Bdisassembled;
2905   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2906   PetscScalar    v;
2907   const PetscInt *rowindices,*colindices;
2908 
2909   PetscFunctionBegin;
2910   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2911   if (A) {
2912     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2913     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2914     if (rowemb) {
2915       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2916       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2917     } else {
2918       if (C->rmap->n != A->rmap->n) {
2919         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2920       }
2921     }
2922     if (dcolemb) {
2923       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2924       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2925     } else {
2926       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2927     }
2928   }
2929   if (B) {
2930     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2931     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2932     if (rowemb) {
2933       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2934       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2935     } else {
2936       if (C->rmap->n != B->rmap->n) {
2937         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2938       }
2939     }
2940     if (ocolemb) {
2941       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2942       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2943     } else {
2944       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2945     }
2946   }
2947 
2948   aij = (Mat_MPIAIJ*)C->data;
2949   if (!aij->A) {
2950     /* Mimic parts of MatMPIAIJSetPreallocation() */
2951     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2952     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2953     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2954     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2955     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2956   }
2957   if (A) {
2958     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2959   } else {
2960     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2961   }
2962   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2963     /*
2964       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate aij->B and
2965       need to "disassemble" it -- convert it to use C's global column indices.
2966       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2967 
2968       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" aij->B and do not reallocate;
2969       we call MatZeroEntries() on it first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2970 
2971       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2972       At least avoid calling MatSetValues() and the implied searches?
2973     */
2974 
2975     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2976 #if defined(PETSC_USE_CTABLE)
2977       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2978 #else
2979       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2980       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2981       if (aij->B) {
2982         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2983       }
2984 #endif
2985       ngcol = 0;
2986       if (aij->lvec) {
2987         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2988       }
2989       if (aij->garray) {
2990         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2991         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2992       }
2993       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2994       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2995     }
2996     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2997       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2998     }
2999     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3000       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3001     }
3002   }
3003   Bdisassembled = PETSC_FALSE;
3004   if (!aij->B) {
3005     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3006     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3007     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3008     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3009     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3010     Bdisassembled = PETSC_TRUE;
3011   }
3012   if (B) {
3013     Baij = (Mat_SeqAIJ*)B->data;
3014     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3015       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3016       for (i=0; i<B->rmap->n; i++) {
3017         nz[i] = Baij->i[i+1] - Baij->i[i];
3018       }
3019       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3020       ierr = PetscFree(nz);CHKERRQ(ierr);
3021     }
3022 
3023     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3024     shift = rend-rstart;
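    /* When aij->B was just created above (Bdisassembled), it uses C's global column numbering, while the
       incoming B's column indices omit the locally owned block; indices at or past rstart are therefore
       shifted past that block below. */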
3025     count = 0;
3026     rowindices = NULL;
3027     colindices = NULL;
3028     if (rowemb) {
3029       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3030     }
3031     if (ocolemb) {
3032       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3033     }
3034     for (i=0; i<B->rmap->n; i++) {
3035       PetscInt row;
3036       row = i;
3037       if (rowindices) row = rowindices[i];
3038       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3039         col  = Baij->j[count];
3040         if (colindices) col = colindices[col];
3041         if (Bdisassembled && col>=rstart) col += shift;
3042         v    = Baij->a[count];
3043         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3044         ++count;
3045       }
3046     }
3047     /* No assembly for aij->B is necessary. */
3048     /* FIXME: set aij->B's nonzerostate correctly. */
3049   } else {
3050     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3051   }
3052   C->preallocated  = PETSC_TRUE;
3053   C->was_assembled = PETSC_FALSE;
3054   C->assembled     = PETSC_FALSE;
3055    /*
3056       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3057       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3058    */
3059   PetscFunctionReturn(0);
3060 }
3061 
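/*
   Illustrative sketch of how the embeddings for MatSetSeqMats_MPIAIJ() might be built (not part of this
   file's build; rowids, dcolids and ocolids are hypothetical caller-owned index arrays of the appropriate
   lengths). Rows of A and B map into [0,C->rmap->n), columns of A into [0,C->cmap->n), and columns of B
   into [0,C->cmap->N - C->cmap->n):

     IS rowemb,dcolemb,ocolemb;

     ierr = ISCreateGeneral(PETSC_COMM_SELF,A->rmap->n,rowids,PETSC_USE_POINTER,&rowemb);CHKERRQ(ierr);
     ierr = ISCreateGeneral(PETSC_COMM_SELF,A->cmap->n,dcolids,PETSC_USE_POINTER,&dcolemb);CHKERRQ(ierr);
     ierr = ISCreateGeneral(PETSC_COMM_SELF,B->cmap->n,ocolids,PETSC_USE_POINTER,&ocolemb);CHKERRQ(ierr);
     ierr = MatSetSeqMats_MPIAIJ(C,rowemb,dcolemb,ocolemb,DIFFERENT_NONZERO_PATTERN,A,B);CHKERRQ(ierr);
     ierr = ISDestroy(&rowemb);CHKERRQ(ierr);
     ierr = ISDestroy(&dcolemb);CHKERRQ(ierr);
     ierr = ISDestroy(&ocolemb);CHKERRQ(ierr);
*/
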
3062 /*
3063   B uses local column indices, ranging between 0 and N-n; they must be interpreted using garray (see the sketch following this function).
3064  */
3065 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3066 {
3067   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3068 
3069   PetscFunctionBegin;
3070   PetscValidPointer(A,2);
3071   PetscValidPointer(B,3);
3072   /* FIXME: make sure C is assembled */
3073   *A = aij->A;
3074   *B = aij->B;
3075   /* Note that we don't incref *A and *B, so be careful! */
3076   PetscFunctionReturn(0);
3077 }
3078 
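/*
   Sketch of interpreting B's local column indices through garray (illustrative only; at the public level
   the same pieces are available via MatMPIAIJGetSeqAIJ(), which also returns garray):

     Mat            Ad,Ao;
     const PetscInt *garray;

     ierr = MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);CHKERRQ(ierr);
     ... for a local column index bcol of Ao, garray[bcol] is the corresponding global column of C ...
*/
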
3079 /*
3080   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3081   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3082 */
3083 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3084                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3085                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3086                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3087                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3088 {
3089   PetscErrorCode ierr;
3090   PetscMPIInt    size,flag;
3091   PetscInt       i,ii,cismax,ispar;
3092   Mat            *A,*B;
3093   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3094 
3095   PetscFunctionBegin;
3096   if (!ismax) PetscFunctionReturn(0);
3097 
3098   for (i = 0, cismax = 0; i < ismax; ++i) {
3099     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3100     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3101     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3102     if (size > 1) ++cismax;
3103   }
3104 
3105   /*
3106      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3107      ispar is the maximum over C's ranks of the per-rank number of parallel ISs; it is nonzero iff some rank holds a parallel IS.
3108   */
3109   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
3110   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3111     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3112     PetscFunctionReturn(0);
3113   }
3114 
3115   /* if (ispar) */
3116   /*
3117     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3118     These are used to extract the off-diag portion of the resulting parallel matrix.
3119     The row IS for the off-diag portion is the same as for the diag portion,
3120     so we merely alias (without increasing the reference count) the row IS, while skipping those that are sequential.
3121   */
3122   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3123   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3124   for (i = 0, ii = 0; i < ismax; ++i) {
3125     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3126     if (size > 1) {
3127       /*
3128          TODO: This is the part that's ***NOT SCALABLE***.
3129          To fix this we need to extract just the indices of C's nonzero columns
3130          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3131          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3132          to be done without serializing on the IS list, so, most likely, it is best
3133          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3134       */
3135       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3136       /* Now we have to
3137          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3138              were sorted on each rank, concatenated they might no longer be sorted;
3139          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3140              indices in the nondecreasing order to the original index positions.
3141          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3142       */
3143       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3144       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3145       ++ii;
3146     }
3147   }
3148   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3149   for (i = 0, ii = 0; i < ismax; ++i) {
3150     PetscInt       j,issize;
3151     const PetscInt *indices;
3152 
3153     /*
3154        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3155      */
3156     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3157     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3158     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3159     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3160     for (j = 1; j < issize; ++j) {
3161       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3162     }
3163     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3164     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3165     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3166     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3167     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3168     for (j = 1; j < issize; ++j) {
3169       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3170     }
3171     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3172     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3173     if (size > 1) {
3174       cisrow[ii] = isrow[i];
3175       ++ii;
3176     }
3177   }
3178   /*
3179     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3180     array of sequential matrices underlying the resulting parallel matrices.
3181     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3182     contain duplicates.
3183 
3184     There are as many diag matrices as there are original index sets. There are only as many parallel
3185     and off-diag matrices as there are parallel (comm size > 1) index sets.
3186 
3187     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3188     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3189       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3190       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3191       with A[i] and B[ii] extracted from the corresponding MPI submat.
3192     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3193       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3194       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3195       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3196       values into A[i] and B[ii] sitting inside the corresponding submat.
3197     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3198       A[i], B[ii], AA[i] or BB[ii] matrices.
3199   */
3200   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3201   if (scall == MAT_INITIAL_MATRIX) {
3202     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3203   }
3204 
3205   /* Now obtain the sequential A and B submatrices separately. */
3206   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3207   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3208   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3209 
3210   /*
3211     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3212     matrices A & B have been extracted directly into the parallel matrices containing them, or
3213     simply into the sequential matrix identical with the corresponding A (if size == 1).
3214     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3215     to have the same sparsity pattern.
3216     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3217     must be constructed for C. This is done by setseqmat(s).
3218   */
3219   for (i = 0, ii = 0; i < ismax; ++i) {
3220     /*
3221        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3222        That way we can avoid sorting and computing permutations when reusing.
3223        To this end:
3224         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3225         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3226           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3227     */
3228     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3229 
3230     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3231     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3232     if (size > 1) {
3233       if (scall == MAT_INITIAL_MATRIX) {
3234         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3235         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3236         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3237         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3238         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3239       }
3240       /*
3241         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3242       */
3243       {
3244         Mat AA = A[i],BB = B[ii];
3245 
3246         if (AA || BB) {
3247           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3248           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3249           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3250         }
3251         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3252       }
3253       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3254       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3255       ++ii;
3256     } else { /* if (size == 1) */
3257       if (scall == MAT_REUSE_MATRIX) {
3258         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3259       }
3260       if (isrow_p[i] || iscol_p[i]) {
3261         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3262         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3263         /* Otherwise A is extracted straight into (*submat)[i]. */
3264         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3265         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3266       } else (*submat)[i] = A[i];
3267     }
3268     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3269     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3270   }
3271   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3272   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3273   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3274   ierr = PetscFree(A);CHKERRQ(ierr);
3275   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3276   PetscFunctionReturn(0);
3277 }
3278 
3279 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3280 {
3281   PetscErrorCode ierr;
3282 
3283   PetscFunctionBegin;
3284   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3285   PetscFunctionReturn(0);
3286 }
3287
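/*
   Caller-side sketch for the parallel variant (illustrative only): isrow0 and iscol0 are hypothetical
   index sets that may live on a subcommunicator of C's communicator, and submat[0] is then created as
   an MPIAIJ matrix on that subcommunicator:

     Mat *submat;

     ierr = MatCreateSubMatricesMPI(C,1,&isrow0,&iscol0,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ... work with submat[0] ...
     ierr = MatDestroySubMatrices(1,&submat);CHKERRQ(ierr);
*/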