xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision efae93bf136c79150336cf04641425da2c58da20)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one level of overlap to the
   nidx index sets is[], replacing each IS in place.

   Communication uses PetscCommBuildTwoSided + PetscSF so that memory and
   message counts scale with the number of neighbour ranks rather than the
   communicator size.  Protocol:
     1. classify each IS entry as local or remote via the row layout;
     2. ship pairs <is id, row> to the owning ranks;
     3. owners expand each received row to its column adjacency
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and ship the result back;
     4. the reply is merged into the (locally expanded) sets
        (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata,nto,nfrom;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt **indices,*indices_i;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices of each IS; tlength is the total over all sets */
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find the entries that live on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* no rank references a remote row, so everything is local work;
   * this usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* compress the dense per-rank counts into (rank,size) send lists */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two PetscInt (is id,row) per entry */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* exchange (size,offset) pairs so each rank learns who sends to it */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest whose leaves are the slots we will receive into */
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* sf takes ownership of 'remote' (PETSC_OWN_POINTER) */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* message is a sequence of pairs <is id, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index into the compressed send list */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* expand the received rows into their column adjacency (sbsizes/sbdata) */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes flow in the reverse direction */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: expand locally while replies are in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received remote adjacency into the index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
201 #undef __FUNCT__
202 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge the adjacency data that
   remote owners computed for our requested rows back into the index sets.

   recvdata is a stream of triples <is id, count, count columns...> of total
   length nrecvs.  Each is[i] is destroyed and recreated with the union of
   its old entries and the received columns, sorted and deduplicated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record the current size of each IS; max_lsize bounds any single set */
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of (max_lsize+nrecvs) entries per IS: enough for the existing
     entries plus everything that could possibly arrive for that set */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* walk the received stream and append the columns to the matching slab */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* sort, remove duplicate entries, and rebuild each IS */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
248 
249 #undef __FUNCT__
250 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for every row requested by a
   remote rank, gather that row's column indices (diagonal block mapped with
   cstart, off-diagonal block mapped through gcols) and pack reply buffers.

   fromrows is a stream of <is id, row> pairs grouped by sender i with
   fromsizes[2*i] entries each.  The reply for each sender is a sequence of
   triples <is id, count, count columns...>; *sbrows receives the packed
   buffer, *sbrowsizes the per-sender (size,offset) pairs.  The caller owns
   and frees both arrays.

   Two passes are made over the same data: the first only counts (so a
   single allocation suffices), the second fills the buffer.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values, used as the scratch-buffer size bound */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count, per sender and per IS, how many distinct columns we will return */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group this sender's requested rows by IS id */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: identical traversal, this time filling the send buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* per-sender offsets of each reply within the packed buffer */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
373 
374 #undef __FUNCT__
375 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each index set by the
   column adjacency of its locally owned rows.

   For every entry of is[i] owned by this rank, the columns of the diagonal
   block (shifted by cstart) and of the off-diagonal block (mapped through
   gcols) are collected; the IS is then recreated from the sorted,
   deduplicated result.

   NOTE(review): entries owned by other ranks are skipped here (continue)
   and thus not copied into the rebuilt IS; presumably their contribution
   is restored via the remote exchange in the _Once_Scalable path - confirm
   for rows whose diagonal entry is structurally zero.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat                   amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
439 
440 
441 /*
442   Sample message format:
443   If a processor A wants processor B to process some elements corresponding
444   to index sets is[1],is[5]
445   mesg [0] = 2   (no of index sets in the mesg)
446   -----------
447   mesg [1] = 1 => is[1]
448   mesg [2] = sizeof(is[1]);
449   -----------
450   mesg [3] = 5  => is[5]
451   mesg [4] = sizeof(is[5]);
452   -----------
453   mesg [5]
454   mesg [n]  datas[1]
455   -----------
456   mesg[n+1]
457   mesg[m]  data(is[5])
458   -----------
459 
460   Notes:
461   nrqs - no of requests sent (or to be sent out)
462   nrqr - no of requests received (which have to be or which have been processed)
463 */
464 #undef __FUNCT__
465 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - add one level of overlap to the imax
   index sets is[], replacing each IS in place (nonscalable variant: uses
   O(size) work arrays and an O(M/8) bit table per IS).

   Protocol (message layout documented in the comment block above):
     1. count, per destination rank, the IS entries it owns (w1..w4);
     2. exchange message counts/lengths and post Irecvs (tag1);
     3. pack <header, rows> requests and Isend them; meanwhile expand the
        locally owned entries with MatIncreaseOverlap_MPIAIJ_Local;
     4. answer incoming requests via MatIncreaseOverlap_MPIAIJ_Receive and
        send the results back (tag2);
     5. merge the replies through the per-IS bit tables and rebuild each IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = rows destined for rank p, w3[p] = number of ISs contributing,
     w4 is the per-IS scratch counter */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers: outdat[j][0] = #IS, then 2*w3[j] header slots */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table and one data slab per IS */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand locally owned entries while messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply for sender i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header is <#IS, (is_no,count) pairs>, then data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild each IS from the accumulated data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
739 /*
740    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
741        the work on the local processor.
742 
743      Inputs:
744       C      - MAT_MPIAIJ;
745       imax - total no of index sets processed at a time;
746       table  - an array of char - size = m bits.
747 
748      Output:
749       isz    - array containing the count of the solution elements corresponding
750                to each index set;
751       data   - pointer to the solutions
752 */
753 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
754 {
755   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
756   Mat        A  = c->A,B = c->B;
757   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
758   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
759   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
760   PetscBT    table_i;
761 
762   PetscFunctionBegin;
763   rstart = C->rmap->rstart;
764   cstart = C->cmap->rstart;
765   ai     = a->i;
766   aj     = a->j;
767   bi     = b->i;
768   bj     = b->j;
769   garray = c->garray;
770 
771 
772   for (i=0; i<imax; i++) {
773     data_i  = data[i];
774     table_i = table[i];
775     isz_i   = isz[i];
776     for (j=0,max=isz[i]; j<max; j++) {
777       row   = data_i[j] - rstart;
778       start = ai[row];
779       end   = ai[row+1];
780       for (k=start; k<end; k++) { /* Amat */
781         val = aj[k] + cstart;
782         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
783       }
784       start = bi[row];
785       end   = bi[row+1];
786       for (k=start; k<end; k++) { /* Bmat */
787         val = garray[bj[k]];
788         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
789       }
790     }
791     isz[i] = isz_i;
792   }
793   PetscFunctionReturn(0);
794 }
795 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  Message layout (both rbuf[i] and the reply xdata[i]):
      [nIS, (is#,count) pairs ..., data ...]
  rbuf[i][0] is the number of index sets in the message; for j = 1..nIS,
  rbuf[i][2*j-1] is the index-set number and rbuf[i][2*j] the count of row
  indices for that set; the row indices follow the header.  In the reply the
  data for each set is the requested rows plus all of their column indices,
  deduplicated per set via the bit table xtable.

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested rows over all messages,
     ct = total number of (is#,count) header pairs */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* size the single shared reply buffer from the average nonzeros per
     local row; it is grown on demand below if the estimate is too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: first row index after the header */
    ct2    =  ct1;          /* write cursor into xdata[i] (header filled later) */
    ct3   += ct1;           /* running total of entries written to xdata[0..] */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* start of this IS's data within the reply */
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves, deduplicated */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* reply buffer full: grow by 1.5x, copy, and re-slice the
               per-message pointers xdata[1..i] into the new buffer */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the column indices of every row just stored */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-reslice as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: garray gives global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-reslice as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: entry count and IS number for this set */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
933 /* -------------------------------------------------------------------------*/
934 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
935 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the entire parallel matrix A into a
       sequential matrix owned by every process, via two MPI_Allgatherv
       rounds (row lengths + column indices) and optionally a third for the
       numerical values.

     Input:
       A     - the parallel MPIAIJ matrix
       flag  - MAT_GET_VALUES to also gather numerical values; otherwise
               only the nonzero structure is gathered
       scall - MAT_INITIAL_MATRIX to build the sequential matrix, or
               MAT_REUSE_MATRIX to refill a previously created one

     Output:
       Bin   - array of length one holding the sequential matrix
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[] is global-length; each process fills its own row range
       (diagonal + off-diagonal nonzero counts) then allgathers it */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* Column indices are written directly into B's j array at this
       process's row offset, so the allgather below can fill B in place.
       Off-diagonal (B-block) entries below the diagonal range are emitted
       first, then the diagonal block, then the remaining B-block entries,
       keeping each row's global columns in ascending order.
       NOTE(review): the split compares garray[] (column indices) against
       A->rmap->rstart, which presumes the row and column ownership ranges
       coincide -- confirm for non-square layouts. */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* per-process counts/displacements are now in nonzeros, not rows */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, reuse the old matrix */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same three-way merge order as the column indices above, so values
       land next to their matching column entries */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* in the MAT_REUSE_MATRIX path recvcounts/displs were never built */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1123 
1124 
1125 
1126 #undef __FUNCT__
1127 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1128 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1129 {
1130   PetscErrorCode ierr;
1131   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1132   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1133 
1134   PetscFunctionBegin;
1135 
1136   /*
1137        Check for special case: each processor gets entire matrix
1138   */
1139   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1140     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1141     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1142     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1143     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1144     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1145       wantallmatrix = PETSC_TRUE;
1146 
1147       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1148     }
1149   }
1150   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1151   if (twantallmatrix) {
1152     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1153     PetscFunctionReturn(0);
1154   }
1155 
1156   /* Allocate memory to hold all the submatrices */
1157   if (scall != MAT_REUSE_MATRIX) {
1158     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1159   }
1160 
1161   /* Check for special case: each processor gets entire matrix columns */
1162   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1163   for (i=0; i<ismax; i++) {
1164     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1165     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1166     if (colflag && ncol == C->cmap->N) {
1167       allcolumns[i] = PETSC_TRUE;
1168     } else {
1169       allcolumns[i] = PETSC_FALSE;
1170     }
1171   }
1172 
1173   /* Determine the number of stages through which submatrices are done */
1174   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1175 
1176   /*
1177      Each stage will extract nmax submatrices.
1178      nmax is determined by the matrix column dimension.
1179      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1180   */
1181   if (!nmax) nmax = 1;
1182   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1183 
1184   /* Make sure every processor loops through the nstages */
1185   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1186 
1187   for (i=0,pos=0; i<nstages; i++) {
1188     if (pos+nmax <= ismax) max_no = nmax;
1189     else if (pos == ismax) max_no = 0;
1190     else                   max_no = ismax-pos;
1191     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1192     pos += max_no;
1193   }
1194 
1195   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 /* -------------------------------------------------------------------------*/
1200 #undef __FUNCT__
1201 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1202 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1203 {
1204   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1205   Mat            A  = c->A;
1206   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1207   const PetscInt **icol,**irow;
1208   PetscInt       *nrow,*ncol,start;
1209   PetscErrorCode ierr;
1210   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1211   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1212   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1213   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1214   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1215 #if defined(PETSC_USE_CTABLE)
1216   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1217 #else
1218   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1219 #endif
1220   const PetscInt *irow_i;
1221   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1222   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1223   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1224   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1225   MPI_Status     *r_status3,*r_status4,*s_status4;
1226   MPI_Comm       comm;
1227   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1228   PetscMPIInt    *onodes1,*olengths1;
1229   PetscMPIInt    idex,idex2,end;
1230 
1231   PetscFunctionBegin;
1232   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1233   tag0 = ((PetscObject)C)->tag;
1234   size = c->size;
1235   rank = c->rank;
1236 
1237   /* Get some new tags to keep the communication clean */
1238   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1239   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1240   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1241 
1242   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1243 
1244   for (i=0; i<ismax; i++) {
1245     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1246     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1247     if (allcolumns[i]) {
1248       icol[i] = NULL;
1249       ncol[i] = C->cmap->N;
1250     } else {
1251       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1252       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1253     }
1254   }
1255 
1256   /* evaluate communication - mesg to who, length of mesg, and buffer space
1257      required. Based on this, buffers are allocated, and data copied into them*/
1258   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1259   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1260   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1261   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1262   for (i=0; i<ismax; i++) {
1263     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1264     jmax   = nrow[i];
1265     irow_i = irow[i];
1266     for (j=0; j<jmax; j++) {
1267       l   = 0;
1268       row = irow_i[j];
1269       while (row >= C->rmap->range[l+1]) l++;
1270       proc = l;
1271       w4[proc]++;
1272     }
1273     for (j=0; j<size; j++) {
1274       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1275     }
1276   }
1277 
1278   nrqs     = 0;              /* no of outgoing messages */
1279   msz      = 0;              /* total mesg length (for all procs) */
1280   w1[rank] = 0;              /* no mesg sent to self */
1281   w3[rank] = 0;
1282   for (i=0; i<size; i++) {
1283     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1284   }
1285   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1286   for (i=0,j=0; i<size; i++) {
1287     if (w1[i]) { pa[j] = i; j++; }
1288   }
1289 
1290   /* Each message would have a header = 1 + 2*(no of IS) + data */
1291   for (i=0; i<nrqs; i++) {
1292     j      = pa[i];
1293     w1[j] += w2[j] + 2* w3[j];
1294     msz   += w1[j];
1295   }
1296   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1297 
1298   /* Determine the number of messages to expect, their lengths, from from-ids */
1299   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1300   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1301 
1302   /* Now post the Irecvs corresponding to these messages */
1303   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1304 
1305   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1306   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1307 
1308   /* Allocate Memory for outgoing messages */
1309   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1310   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1311   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1312 
1313   {
1314     PetscInt *iptr = tmp,ict = 0;
1315     for (i=0; i<nrqs; i++) {
1316       j        = pa[i];
1317       iptr    += ict;
1318       sbuf1[j] = iptr;
1319       ict      = w1[j];
1320     }
1321   }
1322 
1323   /* Form the outgoing messages */
1324   /* Initialize the header space */
1325   for (i=0; i<nrqs; i++) {
1326     j           = pa[i];
1327     sbuf1[j][0] = 0;
1328     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1329     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1330   }
1331 
1332   /* Parse the isrow and copy data into outbuf */
1333   for (i=0; i<ismax; i++) {
1334     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1335     irow_i = irow[i];
1336     jmax   = nrow[i];
1337     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1338       l   = 0;
1339       row = irow_i[j];
1340       while (row >= C->rmap->range[l+1]) l++;
1341       proc = l;
1342       if (proc != rank) { /* copy to the outgoing buf*/
1343         ctr[proc]++;
1344         *ptr[proc] = row;
1345         ptr[proc]++;
1346       }
1347     }
1348     /* Update the headers for the current IS */
1349     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1350       if ((ctr_j = ctr[j])) {
1351         sbuf1_j        = sbuf1[j];
1352         k              = ++sbuf1_j[0];
1353         sbuf1_j[2*k]   = ctr_j;
1354         sbuf1_j[2*k-1] = i;
1355       }
1356     }
1357   }
1358 
1359   /*  Now  post the sends */
1360   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1361   for (i=0; i<nrqs; ++i) {
1362     j    = pa[i];
1363     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1364   }
1365 
1366   /* Post Receives to capture the buffer size */
1367   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1368   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1369   rbuf2[0] = tmp + msz;
1370   for (i=1; i<nrqs; ++i) {
1371     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1372   }
1373   for (i=0; i<nrqs; ++i) {
1374     j    = pa[i];
1375     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1376   }
1377 
1378   /* Send to other procs the buf size they should allocate */
1379 
1380 
1381   /* Receive messages*/
1382   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1383   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1384   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1385   {
1386     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1387     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1388     PetscInt   *sbuf2_i;
1389 
1390     for (i=0; i<nrqr; ++i) {
1391       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1392 
1393       req_size[idex] = 0;
1394       rbuf1_i        = rbuf1[idex];
1395       start          = 2*rbuf1_i[0] + 1;
1396       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1397       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1398       sbuf2_i        = sbuf2[idex];
1399       for (j=start; j<end; j++) {
1400         id              = rbuf1_i[j] - rstart;
1401         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1402         sbuf2_i[j]      = ncols;
1403         req_size[idex] += ncols;
1404       }
1405       req_source[idex] = r_status1[i].MPI_SOURCE;
1406       /* form the header */
1407       sbuf2_i[0] = req_size[idex];
1408       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1409 
1410       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1411     }
1412   }
1413   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1414   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1415 
1416   /*  recv buffer sizes */
1417   /* Receive messages*/
1418 
1419   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1420   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1421   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1422   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1423   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1424 
1425   for (i=0; i<nrqs; ++i) {
1426     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1427     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1428     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1429     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1430     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1431   }
1432   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1433   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1434 
1435   /* Wait on sends1 and sends2 */
1436   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1437   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1438 
1439   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1440   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1441   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1442   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1443   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1444   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1445 
1446   /* Now allocate buffers for a->j, and send them off */
1447   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1448   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1449   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1450   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1451 
1452   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1453   {
1454     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1455     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1456     PetscInt cend = C->cmap->rend;
1457     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1458 
1459     for (i=0; i<nrqr; i++) {
1460       rbuf1_i   = rbuf1[i];
1461       sbuf_aj_i = sbuf_aj[i];
1462       ct1       = 2*rbuf1_i[0] + 1;
1463       ct2       = 0;
1464       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1465         kmax = rbuf1[i][2*j];
1466         for (k=0; k<kmax; k++,ct1++) {
1467           row    = rbuf1_i[ct1] - rstart;
1468           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1469           ncols  = nzA + nzB;
1470           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1471 
1472           /* load the column indices for this row into cols*/
1473           cols = sbuf_aj_i + ct2;
1474 
1475           lwrite = 0;
1476           for (l=0; l<nzB; l++) {
1477             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1478           }
1479           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1480           for (l=0; l<nzB; l++) {
1481             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1482           }
1483 
1484           ct2 += ncols;
1485         }
1486       }
1487       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1488     }
1489   }
1490   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1491   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1492 
1493   /* Allocate buffers for a->a, and send them off */
1494   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1495   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1496   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1497   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1498 
1499   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1500   {
1501     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1502     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1503     PetscInt    cend   = C->cmap->rend;
1504     PetscInt    *b_j   = b->j;
1505     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1506 
1507     for (i=0; i<nrqr; i++) {
1508       rbuf1_i   = rbuf1[i];
1509       sbuf_aa_i = sbuf_aa[i];
1510       ct1       = 2*rbuf1_i[0]+1;
1511       ct2       = 0;
1512       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1513         kmax = rbuf1_i[2*j];
1514         for (k=0; k<kmax; k++,ct1++) {
1515           row    = rbuf1_i[ct1] - rstart;
1516           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1517           ncols  = nzA + nzB;
1518           cworkB = b_j + b_i[row];
1519           vworkA = a_a + a_i[row];
1520           vworkB = b_a + b_i[row];
1521 
1522           /* load the column values for this row into vals*/
1523           vals = sbuf_aa_i+ct2;
1524 
1525           lwrite = 0;
1526           for (l=0; l<nzB; l++) {
1527             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1528           }
1529           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1530           for (l=0; l<nzB; l++) {
1531             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1532           }
1533 
1534           ct2 += ncols;
1535         }
1536       }
1537       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1538     }
1539   }
1540   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1541   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1542   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1543   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1544 
1545   /* Form the matrix */
1546   /* create col map: global col of C -> local col of submatrices */
1547   {
1548     const PetscInt *icol_i;
1549 #if defined(PETSC_USE_CTABLE)
1550     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1551     for (i=0; i<ismax; i++) {
1552       if (!allcolumns[i]) {
1553         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1554 
1555         jmax   = ncol[i];
1556         icol_i = icol[i];
1557         cmap_i = cmap[i];
1558         for (j=0; j<jmax; j++) {
1559           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1560         }
1561       } else {
1562         cmap[i] = NULL;
1563       }
1564     }
1565 #else
1566     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1567     for (i=0; i<ismax; i++) {
1568       if (!allcolumns[i]) {
1569         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1570         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1571         jmax   = ncol[i];
1572         icol_i = icol[i];
1573         cmap_i = cmap[i];
1574         for (j=0; j<jmax; j++) {
1575           cmap_i[icol_i[j]] = j+1;
1576         }
1577       } else {
1578         cmap[i] = NULL;
1579       }
1580     }
1581 #endif
1582   }
1583 
1584   /* Create lens which is required for MatCreate... */
1585   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1586   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1587   if (ismax) {
1588     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1589     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1590   }
1591   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1592 
1593   /* Update lens from local data */
1594   for (i=0; i<ismax; i++) {
1595     jmax = nrow[i];
1596     if (!allcolumns[i]) cmap_i = cmap[i];
1597     irow_i = irow[i];
1598     lens_i = lens[i];
1599     for (j=0; j<jmax; j++) {
1600       l   = 0;
1601       row = irow_i[j];
1602       while (row >= C->rmap->range[l+1]) l++;
1603       proc = l;
1604       if (proc == rank) {
1605         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1606         if (!allcolumns[i]) {
1607           for (k=0; k<ncols; k++) {
1608 #if defined(PETSC_USE_CTABLE)
1609             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1610 #else
1611             tcol = cmap_i[cols[k]];
1612 #endif
1613             if (tcol) lens_i[j]++;
1614           }
1615         } else { /* allcolumns */
1616           lens_i[j] = ncols;
1617         }
1618         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1619       }
1620     }
1621   }
1622 
1623   /* Create row map: global row of C -> local row of submatrices */
1624 #if defined(PETSC_USE_CTABLE)
1625   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1626   for (i=0; i<ismax; i++) {
1627     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1628     rmap_i = rmap[i];
1629     irow_i = irow[i];
1630     jmax   = nrow[i];
1631     for (j=0; j<jmax; j++) {
1632       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1633     }
1634   }
1635 #else
1636   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1637   if (ismax) {
1638     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1639     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1640   }
1641   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1642   for (i=0; i<ismax; i++) {
1643     rmap_i = rmap[i];
1644     irow_i = irow[i];
1645     jmax   = nrow[i];
1646     for (j=0; j<jmax; j++) {
1647       rmap_i[irow_i[j]] = j;
1648     }
1649   }
1650 #endif
1651 
1652   /* Update lens from offproc data */
1653   {
1654     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1655 
1656     for (tmp2=0; tmp2<nrqs; tmp2++) {
1657       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1658       idex    = pa[idex2];
1659       sbuf1_i = sbuf1[idex];
1660       jmax    = sbuf1_i[0];
1661       ct1     = 2*jmax+1;
1662       ct2     = 0;
1663       rbuf2_i = rbuf2[idex2];
1664       rbuf3_i = rbuf3[idex2];
1665       for (j=1; j<=jmax; j++) {
1666         is_no  = sbuf1_i[2*j-1];
1667         max1   = sbuf1_i[2*j];
1668         lens_i = lens[is_no];
1669         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1670         rmap_i = rmap[is_no];
1671         for (k=0; k<max1; k++,ct1++) {
1672 #if defined(PETSC_USE_CTABLE)
1673           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1674           row--;
1675           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1676 #else
1677           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1678 #endif
1679           max2 = rbuf2_i[ct1];
1680           for (l=0; l<max2; l++,ct2++) {
1681             if (!allcolumns[is_no]) {
1682 #if defined(PETSC_USE_CTABLE)
1683               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1684 #else
1685               tcol = cmap_i[rbuf3_i[ct2]];
1686 #endif
1687               if (tcol) lens_i[row]++;
1688             } else { /* allcolumns */
1689               lens_i[row]++; /* lens_i[row] += max2 ? */
1690             }
1691           }
1692         }
1693       }
1694     }
1695   }
1696   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1697   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1698   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1699   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1700   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1701 
1702   /* Create the submatrices */
1703   if (scall == MAT_REUSE_MATRIX) {
1704     PetscBool flag;
1705 
1706     /*
1707         Assumes new rows are same length as the old rows,hence bug!
1708     */
1709     for (i=0; i<ismax; i++) {
1710       mat = (Mat_SeqAIJ*)(submats[i]->data);
1711       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1712       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1713       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1714       /* Initial matrix as if empty */
1715       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1716 
1717       submats[i]->factortype = C->factortype;
1718     }
1719   } else {
1720     for (i=0; i<ismax; i++) {
1721       PetscInt rbs,cbs;
1722       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1723       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1724 
1725       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1726       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1727 
1728       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1729       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1730       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1731     }
1732   }
1733 
1734   /* Assemble the matrices */
1735   /* First assemble the local rows */
1736   {
1737     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1738     PetscScalar *imat_a;
1739 
1740     for (i=0; i<ismax; i++) {
1741       mat       = (Mat_SeqAIJ*)submats[i]->data;
1742       imat_ilen = mat->ilen;
1743       imat_j    = mat->j;
1744       imat_i    = mat->i;
1745       imat_a    = mat->a;
1746 
1747       if (!allcolumns[i]) cmap_i = cmap[i];
1748       rmap_i = rmap[i];
1749       irow_i = irow[i];
1750       jmax   = nrow[i];
1751       for (j=0; j<jmax; j++) {
1752         l   = 0;
1753         row = irow_i[j];
1754         while (row >= C->rmap->range[l+1]) l++;
1755         proc = l;
1756         if (proc == rank) {
1757           old_row = row;
1758 #if defined(PETSC_USE_CTABLE)
1759           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1760           row--;
1761 #else
1762           row = rmap_i[row];
1763 #endif
1764           ilen_row = imat_ilen[row];
1765           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1766           mat_i    = imat_i[row];
1767           mat_a    = imat_a + mat_i;
1768           mat_j    = imat_j + mat_i;
1769           if (!allcolumns[i]) {
1770             for (k=0; k<ncols; k++) {
1771 #if defined(PETSC_USE_CTABLE)
1772               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1773 #else
1774               tcol = cmap_i[cols[k]];
1775 #endif
1776               if (tcol) {
1777                 *mat_j++ = tcol - 1;
1778                 *mat_a++ = vals[k];
1779                 ilen_row++;
1780               }
1781             }
1782           } else { /* allcolumns */
1783             for (k=0; k<ncols; k++) {
1784               *mat_j++ = cols[k];  /* global col index! */
1785               *mat_a++ = vals[k];
1786               ilen_row++;
1787             }
1788           }
1789           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1790 
1791           imat_ilen[row] = ilen_row;
1792         }
1793       }
1794     }
1795   }
1796 
1797   /*   Now assemble the off proc rows*/
1798   {
1799     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1800     PetscInt    *imat_j,*imat_i;
1801     PetscScalar *imat_a,*rbuf4_i;
1802 
1803     for (tmp2=0; tmp2<nrqs; tmp2++) {
1804       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1805       idex    = pa[idex2];
1806       sbuf1_i = sbuf1[idex];
1807       jmax    = sbuf1_i[0];
1808       ct1     = 2*jmax + 1;
1809       ct2     = 0;
1810       rbuf2_i = rbuf2[idex2];
1811       rbuf3_i = rbuf3[idex2];
1812       rbuf4_i = rbuf4[idex2];
1813       for (j=1; j<=jmax; j++) {
1814         is_no     = sbuf1_i[2*j-1];
1815         rmap_i    = rmap[is_no];
1816         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1817         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1818         imat_ilen = mat->ilen;
1819         imat_j    = mat->j;
1820         imat_i    = mat->i;
1821         imat_a    = mat->a;
1822         max1      = sbuf1_i[2*j];
1823         for (k=0; k<max1; k++,ct1++) {
1824           row = sbuf1_i[ct1];
1825 #if defined(PETSC_USE_CTABLE)
1826           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1827           row--;
1828 #else
1829           row = rmap_i[row];
1830 #endif
1831           ilen  = imat_ilen[row];
1832           mat_i = imat_i[row];
1833           mat_a = imat_a + mat_i;
1834           mat_j = imat_j + mat_i;
1835           max2  = rbuf2_i[ct1];
1836           if (!allcolumns[is_no]) {
1837             for (l=0; l<max2; l++,ct2++) {
1838 
1839 #if defined(PETSC_USE_CTABLE)
1840               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1841 #else
1842               tcol = cmap_i[rbuf3_i[ct2]];
1843 #endif
1844               if (tcol) {
1845                 *mat_j++ = tcol - 1;
1846                 *mat_a++ = rbuf4_i[ct2];
1847                 ilen++;
1848               }
1849             }
1850           } else { /* allcolumns */
1851             for (l=0; l<max2; l++,ct2++) {
1852               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1853               *mat_a++ = rbuf4_i[ct2];
1854               ilen++;
1855             }
1856           }
1857           imat_ilen[row] = ilen;
1858         }
1859       }
1860     }
1861   }
1862 
1863   /* sort the rows */
1864   {
1865     PetscInt    *imat_ilen,*imat_j,*imat_i;
1866     PetscScalar *imat_a;
1867 
1868     for (i=0; i<ismax; i++) {
1869       mat       = (Mat_SeqAIJ*)submats[i]->data;
1870       imat_j    = mat->j;
1871       imat_i    = mat->i;
1872       imat_a    = mat->a;
1873       imat_ilen = mat->ilen;
1874 
1875       if (allcolumns[i]) continue;
1876       jmax = nrow[i];
1877       for (j=0; j<jmax; j++) {
1878         PetscInt ilen;
1879 
1880         mat_i = imat_i[j];
1881         mat_a = imat_a + mat_i;
1882         mat_j = imat_j + mat_i;
1883         ilen  = imat_ilen[j];
1884         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1885       }
1886     }
1887   }
1888 
1889   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1890   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1891   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1892   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1893   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1894 
1895   /* Restore the indices */
1896   for (i=0; i<ismax; i++) {
1897     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1898     if (!allcolumns[i]) {
1899       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1900     }
1901   }
1902 
1903   /* Destroy allocated memory */
1904   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1905   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1906   ierr = PetscFree(pa);CHKERRQ(ierr);
1907 
1908   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1909   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1910   for (i=0; i<nrqr; ++i) {
1911     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1912   }
1913   for (i=0; i<nrqs; ++i) {
1914     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1915     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1916   }
1917 
1918   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1919   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1920   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1921   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1922   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1923   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1924   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1925 
1926 #if defined(PETSC_USE_CTABLE)
1927   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1928 #else
1929   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1930 #endif
1931   ierr = PetscFree(rmap);CHKERRQ(ierr);
1932 
1933   for (i=0; i<ismax; i++) {
1934     if (!allcolumns[i]) {
1935 #if defined(PETSC_USE_CTABLE)
1936       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1937 #else
1938       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1939 #endif
1940     }
1941   }
1942   ierr = PetscFree(cmap);CHKERRQ(ierr);
1943   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1944   ierr = PetscFree(lens);CHKERRQ(ierr);
1945 
1946   for (i=0; i<ismax; i++) {
1947     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1948     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1949   }
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 /*
1954  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1955  Be careful not to destroy them elsewhere.
1956  */
1957 #undef __FUNCT__
1958 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1959 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1960 {
1961   /* If making this function public, change the error returned in this function away from _PLIB. */
1962   PetscErrorCode ierr;
1963   Mat_MPIAIJ     *aij;
1964   PetscBool      seqaij;
1965 
1966   PetscFunctionBegin;
1967   /* Check to make sure the component matrices are compatible with C. */
1968   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1969   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1970   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1971   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1972   if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1973 
1974   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1975   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1976   ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
1977   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1978   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1979   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1980   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1981   aij    = (Mat_MPIAIJ*)((*C)->data);
1982   aij->A = A;
1983   aij->B = B;
1984   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1985   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1986 
1987   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1988   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1989   PetscFunctionReturn(0);
1990 }
1991 
1992 #undef __FUNCT__
1993 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1994 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1995 {
1996   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1997 
1998   PetscFunctionBegin;
1999   PetscValidPointer(A,2);
2000   PetscValidPointer(B,3);
2001   *A = aij->A;
2002   *B = aij->B;
2003   /* Note that we don't incref *A and *B, so be careful! */
2004   PetscFunctionReturn(0);
2005 }
2006 
2007 #undef __FUNCT__
2008 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
2009 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2010                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
2011                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
2012                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
2013 {
2014   PetscErrorCode ierr;
2015   PetscMPIInt    size, flag;
2016   PetscInt       i,ii;
2017   PetscInt       ismax_c;
2018 
2019   PetscFunctionBegin;
2020   if (!ismax) PetscFunctionReturn(0);
2021 
2022   for (i = 0, ismax_c = 0; i < ismax; ++i) {
2023     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
2024     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2025     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2026     if (size > 1) ++ismax_c;
2027   }
2028   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
2029     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2030   } else { /* if (ismax_c) */
2031     Mat         *A,*B;
2032     IS          *isrow_c, *iscol_c;
2033     PetscMPIInt size;
2034     /*
2035      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2036      array of sequential matrices underlying the resulting parallel matrices.
2037      Which arrays to allocate is based on the value of MatReuse scall.
2038      There are as many diag matrices as there are original index sets.
2039      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
2040 
2041      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
2042      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
2043      will deposite the extracted diag and off-diag parts.
2044      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
2045      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
2046     */
2047 
2048     /* Parallel matrix array is allocated only if no reuse is taking place. */
2049     if (scall != MAT_REUSE_MATRIX) {
2050       ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2051     } else {
2052       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
2053       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
2054       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
2055       for (i = 0, ii = 0; i < ismax; ++i) {
2056         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2057         if (size > 1) {
2058           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
2059           ++ii;
2060         } else A[i] = (*submat)[i];
2061       }
2062     }
2063     /*
2064      Construct the complements of the iscol ISs for parallel ISs only.
2065      These are used to extract the off-diag portion of the resulting parallel matrix.
2066      The row IS for the off-diag portion is the same as for the diag portion,
2067      so we merely alias the row IS, while skipping those that are sequential.
2068     */
2069     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
2070     for (i = 0, ii = 0; i < ismax; ++i) {
2071       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2072       if (size > 1) {
2073         isrow_c[ii] = isrow[i];
2074 
2075         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
2076         ++ii;
2077       }
2078     }
2079     /* Now obtain the sequential A and B submatrices separately. */
2080     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
2081     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
2082     for (ii = 0; ii < ismax_c; ++ii) {
2083       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
2084     }
2085     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
2086     /*
2087      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
2088      have been extracted directly into the parallel matrices containing them, or
2089      simply into the sequential matrix identical with the corresponding A (if size == 1).
2090      Otherwise, make sure that parallel matrices are constructed from A & B, or the
2091      A is put into the correct submat slot (if size == 1).
2092      */
2093     if (scall != MAT_REUSE_MATRIX) {
2094       for (i = 0, ii = 0; i < ismax; ++i) {
2095         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2096         if (size > 1) {
2097           /*
2098            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
2099            */
2100           /* Construct submat[i] from the Seq pieces A and B. */
2101           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
2102 
2103           ++ii;
2104         } else (*submat)[i] = A[i];
2105       }
2106     }
2107     ierr = PetscFree(A);CHKERRQ(ierr);
2108     ierr = PetscFree(B);CHKERRQ(ierr);
2109   }
2110   PetscFunctionReturn(0);
2111 } /* MatGetSubMatricesParallel_MPIXAIJ() */
2112 
2113 
2114 
2115 #undef __FUNCT__
2116 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
2117 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2118 {
2119   PetscErrorCode ierr;
2120 
2121   PetscFunctionBegin;
2122   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
2123   PetscFunctionReturn(0);
2124 }
2125