xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision b1b1104fb3564ca1342e845fa7119e30388145b7)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Adds one level of overlap to the
   index sets using a memory-scalable scheme: rows owned by other ranks are
   shipped to their owners via PetscCommBuildTwoSided/PetscSF, expanded there
   (MatIncreaseOverlap_MPIAIJ_Send_Scalable), and the resulting column indices
   are shipped back and merged (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
   Locally owned rows are expanded in place by
   MatIncreaseOverlap_MPIAIJ_Local_Scalable.

   Input:
     mat  - the MPIAIJ matrix
     nidx - number of index sets
     is   - the index sets, replaced in place by the enlarged sets
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get the size and indices of every index set; tlength bounds the number of remote rows */
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* classify the rows held in the index sets by owning rank */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* record rows owned by remote processors; locally owned rows need no communication */
      if(owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* owning processor */
        rrow_isids[nrrows]   = i; /* which index set the row came from */
        remoterows[nrrows++] = indices_i[j]; /* global row number */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* no rank has any remote rows: skip all communication and do only local work
   * (usually corresponds to overlap 1)
   * */
  if(!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* compact the per-rank counts into (rank, size) pairs for the ranks we actually message */
  for(i=0; i<size; i++){
   if(tosizes_temp[i]){
    tosizes[nto*2]  = tosizes_temp[i]*2; /* message length: 2 entries (is id, row) per remote row */
    tosizes_temp[i] = nto; /* a map from processor to index */
    toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offset of each outgoing message in todata */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send so receivers can index the SF roots */
  }
  /* exchange (length, offset) pairs with the destination ranks */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest whose leaves are the entries we will receive */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
    for(j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j; /* root index = sender's offset + position */
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* pack the outgoing messages as pairs <index-set id, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for(i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index into the compacted rank list */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* expand the received remote rows into column indices to send back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes/offsets travel in the reverse direction */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
    for(j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap the return communication with the local expansion */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the returned column indices into the (already locally expanded) index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges column indices received
   from remote ranks into the index sets.  The incoming buffer recvdata is a
   sequence of triples <is id, count, count indices...>.  Each IS is destroyed
   and recreated with the merged, sorted, duplicate-free index list.

   Input:
     mat      - the MPIAIJ matrix (unused here beyond context)
     nidx     - number of index sets
     is       - the index sets, replaced in place
     nrecvs   - total number of PetscInt entries in recvdata
     recvdata - the received message buffer
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record current sizes and the largest IS so a single workspace can hold any of them */
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
   ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
   max_lsize = lsize>max_lsize ? lsize:max_lsize;
   isz[i]    = lsize;
  }
  /* each IS gets a slot of max_lsize+nrecvs entries: existing indices plus, at most, all received ones */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* walk the message: each record is <is id, count, indices...>; append to the target slot */
  for(i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for(j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* sort, remove duplicate entries, and rebuild each IS */
  for(i=0; i<nidx; i++){
   indices_i  = indices_temp+(max_lsize+nrecvs)*i;
   isz_i      = isz[i];
   ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
248 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For rows requested by remote
   ranks, gathers the column indices of those rows (from both the diagonal
   Amat and off-diagonal Bmat parts) and packs them into reply messages.
   Two passes are made over the received requests: the first sizes the reply,
   the second fills it.  Replies use the triple format
   <is id, number of columns, columns...> expected by
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable.

   Input:
     mat        - the MPIAIJ matrix
     nidx       - number of index sets
     nfrom      - number of requesting ranks
     fromranks  - the requesting ranks
     fromsizes  - per-rank (length, offset) pairs of the received requests
     fromrows   - received <is id, row> pairs

   Output (ownership transferred to the caller, freed with PetscFree2):
     sbrowsizes - per-rank (length, offset) pairs of the replies
     sbrows     - the packed reply buffer
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values; bounds the scratch space for any row expansion */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest incoming message determines the per-IS row-grouping buffer size */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count how many unique columns each (rank, is) reply will carry */
  for(i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group the requested rows of this rank by index-set id */
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* index-set id */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* convert to local row number */
        start  = ai[row];
        end    = ai[row+1];
        for(k=start; k<end; k++){ /* Amat: local columns shifted to global numbering */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat: map through garray to global columns */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* pass 2: pack the reply as triples <is id, number of columns, columns...> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* regroup the requested rows exactly as in pass 1 */
  for(i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue; /* nothing for this index set */
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2; /* header contributes two entries */
      rows_i              = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* compute per-rank offsets of the replies inside sbdata */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
377 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expands each index set by the
   column indices of its locally owned rows (diagonal Amat part shifted to
   global numbering, off-diagonal Bmat part mapped through garray).  Rows
   owned by other ranks are skipped; they are handled by the send/receive
   routines.  Each IS is destroyed and recreated with the sorted,
   duplicate-free result.

   Input:
     mat  - the MPIAIJ matrix
     nidx - number of index sets
     is   - the index sets, replaced in place
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for(j=0; j<lsize; j++) {
       owner = -1;
       row   = indices[j];
       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
       if(owner != rank) continue; /* remote rows are expanded by their owners */
       /* convert to local row number */
       row  -= rstart;
       start = ai[row];
       end   = ai[row+1];
       for(k=start; k<end; k++) { /* Amat: shift local columns to global numbering */
         col = aj[k] + cstart;
         indices_temp[lsize_tmp++] = col;
       }
       start = bi[row];
       end   = bi[row+1];
       for(k=start; k<end; k++) { /* Bmat: map through garray to global columns */
         col = gcols[bj[k]];
         indices_temp[lsize_tmp++] = col;
       }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   /* NOTE(review): the rebuilt IS contains only columns of locally owned rows;
      the original (remote) entries are re-added via the send/receive path */
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
443 
444 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the message)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Adds one level of overlap to the index
   sets using the classical algorithm: every rank sends the rows it does not
   own to their owners (phase 1), expands its own rows locally with bit
   tables, receives the expansion of its remote rows back (phase 2), and
   rebuilds each IS from the merged result.  Uses O(size) work arrays and
   O(M*imax) table storage, hence the separate "scalable" variant above.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w1[p]: total rows destined for rank p */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w2[p]: 1 iff a message goes to rank p */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w3[p]: number of index sets with rows for rank p */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w4[p]: rows of is[i] owned by rank p */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* number of outgoing messages */
  msz      = 0;              /* total message length (for all procs) */
  w1[rank] = 0;              /* no message sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: entry 0 = number of IS records, then 2*w3[j] header slots */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    /* one bit table (M bits) and one data array (M entries) per index set */
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j; /* number of rows for this IS */
          outdat_j[2*k-1] = i;     /* IS id */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand locally owned rows while messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the rows requested by other ranks; xdata[i]/isz1[i] hold the replies */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the request message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data region starts after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild each IS from the merged local + remote expansion */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
740 
741 #undef __FUNCT__
742 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
743 /*
744    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
745        the work on the local processor.
746 
747      Inputs:
748       C      - MAT_MPIAIJ;
749       imax - total no of index sets processed at a time;
750       table  - an array of char - size = m bits.
751 
752      Output:
753       isz    - array containing the count of the solution elements corresponding
754                to each index set;
755       data   - pointer to the solutions
756 */
757 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
758 {
759   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
760   Mat        A  = c->A,B = c->B;
761   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
762   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
763   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
764   PetscBT    table_i;
765 
766   PetscFunctionBegin;
767   rstart = C->rmap->rstart;
768   cstart = C->cmap->rstart;
769   ai     = a->i;
770   aj     = a->j;
771   bi     = b->i;
772   bj     = b->j;
773   garray = c->garray;
774 
775 
776   for (i=0; i<imax; i++) {
777     data_i  = data[i];
778     table_i = table[i];
779     isz_i   = isz[i];
780     for (j=0,max=isz[i]; j<max; j++) {
781       row   = data_i[j] - rstart;
782       start = ai[row];
783       end   = ai[row+1];
784       for (k=start; k<end; k++) { /* Amat */
785         val = aj[k] + cstart;
786         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
787       }
788       start = bi[row];
789       end   = bi[row+1];
790       for (k=start; k<end; k++) { /* Bmat */
791         val = garray[bj[k]];
792         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
793       }
794     }
795     isz[i] = isz_i;
796   }
797   PetscFunctionReturn(0);
798 }
799 
800 #undef __FUNCT__
801 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
802 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
819 
820 */
/* Build, for each of the nrqr incoming requests, a reply holding the
   duplicate-free union of the column indices (global numbering) of every
   requested row.  All replies live in one buffer anchored at xdata[0];
   xdata[i] points at reply i within it and isz1[i] is its length. */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* Count the total number of requested rows (ct) and the total number of
     row indices carried by all requests (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the reply-buffer size from the average nonzeros per local row;
     if the estimate proves too small it is grown by 1.5x on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;  /* running total of PetscInts written into xdata[0] */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];             /* number of index sets in this message */
    ct1    =  2*rbuf_0+1;            /* read cursor: skip the header */
    ct2    =  ct1;                   /* write cursor into xdata[i] */
    ct3   += ct1;                    /* reserve room for the reply header */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh bit table per IS so sets are deduplicated independently */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;                    /* start of this IS's rows in the reply */
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* buffer full: grow by 1.5x and rebase the per-process pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the column indices of each row just recorded */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;  /* local row index */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: size and id of this IS in the reply */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;  /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
937 /* -------------------------------------------------------------------------*/
938 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
939 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
940 /*
941     Every processor gets the entire matrix
942 */
943 #undef __FUNCT__
944 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   Gathers a complete sequential copy of the parallel matrix A onto every
   process, returned in *Bin.  With MAT_INITIAL_MATRIX the structure is built
   and gathered; with MAT_REUSE_MATRIX the existing structure in *Bin is
   reused.  Numerical values are gathered only when flag == MAT_GET_VALUES.
   Each process contributes its rows with columns merged back into global
   ascending order (below-diagonal off-diag part, diagonal part, above-diagonal
   off-diag part).
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at the slot for this rank's rows */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs switch meaning: now counts of nonzeros, not rows */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered in a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    /* write directly into B's value array at this rank's slot, using the
       same below-diagonal / diagonal / above-diagonal merge as for b->j */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed only conceptually in the REUSE path; they
       are NULL there, so allocate on demand */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);  /* PetscFree2 tolerates NULL */

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1127 
1128 
1129 
1130 #undef __FUNCT__
1131 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1132 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1133 {
1134   PetscErrorCode ierr;
1135   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1136   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1137 
1138   PetscFunctionBegin;
1139   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
1140   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
1141      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
1142      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
1143    */
1144   for (i = 0; i < ismax; ++i) {
1145     PetscBool sorted;
1146     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
1147     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
1148   }
1149 
1150   /*
1151        Check for special case: each processor gets entire matrix
1152   */
1153   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1154     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1155     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1156     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1157     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1158     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1159       wantallmatrix = PETSC_TRUE;
1160 
1161       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1162     }
1163   }
1164   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1165   if (twantallmatrix) {
1166     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1167     PetscFunctionReturn(0);
1168   }
1169 
1170   /* Allocate memory to hold all the submatrices */
1171   if (scall != MAT_REUSE_MATRIX) {
1172     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1173   }
1174 
1175   /* Check for special case: each processor gets entire matrix columns */
1176   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1177   for (i=0; i<ismax; i++) {
1178     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1179     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1180     if (colflag && ncol == C->cmap->N) {
1181       allcolumns[i] = PETSC_TRUE;
1182     } else {
1183       allcolumns[i] = PETSC_FALSE;
1184     }
1185   }
1186 
1187   /* Determine the number of stages through which submatrices are done */
1188   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1189 
1190   /*
1191      Each stage will extract nmax submatrices.
1192      nmax is determined by the matrix column dimension.
1193      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1194   */
1195   if (!nmax) nmax = 1;
1196   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1197 
1198   /* Make sure every processor loops through the nstages */
1199   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1200 
1201   for (i=0,pos=0; i<nstages; i++) {
1202     if (pos+nmax <= ismax) max_no = nmax;
1203     else if (pos == ismax) max_no = 0;
1204     else                   max_no = ismax-pos;
1205     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1206     pos += max_no;
1207   }
1208 
1209   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1210   PetscFunctionReturn(0);
1211 }
1212 
1213 /* -------------------------------------------------------------------------*/
1214 #undef __FUNCT__
1215 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1216 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1217 {
1218   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1219   Mat            A  = c->A;
1220   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1221   const PetscInt **icol,**irow;
1222   PetscInt       *nrow,*ncol,start;
1223   PetscErrorCode ierr;
1224   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1225   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1226   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1227   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1228   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1229 #if defined(PETSC_USE_CTABLE)
1230   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1231 #else
1232   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1233 #endif
1234   const PetscInt *irow_i;
1235   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1236   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1237   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1238   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1239   MPI_Status     *r_status3,*r_status4,*s_status4;
1240   MPI_Comm       comm;
1241   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1242   PetscMPIInt    *onodes1,*olengths1;
1243   PetscMPIInt    idex,idex2,end;
1244 
1245   PetscFunctionBegin;
1246   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1247   tag0 = ((PetscObject)C)->tag;
1248   size = c->size;
1249   rank = c->rank;
1250 
1251   /* Get some new tags to keep the communication clean */
1252   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1253   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1254   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1255 
1256   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1257 
1258   for (i=0; i<ismax; i++) {
1259     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1260     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1261     if (allcolumns[i]) {
1262       icol[i] = NULL;
1263       ncol[i] = C->cmap->N;
1264     } else {
1265       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1266       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1267     }
1268   }
1269 
1270   /* evaluate communication - mesg to who, length of mesg, and buffer space
1271      required. Based on this, buffers are allocated, and data copied into them*/
1272   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1273   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1274   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1275   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1276   for (i=0; i<ismax; i++) {
1277     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1278     jmax   = nrow[i];
1279     irow_i = irow[i];
1280     for (j=0; j<jmax; j++) {
1281       l   = 0;
1282       row = irow_i[j];
1283       while (row >= C->rmap->range[l+1]) l++;
1284       proc = l;
1285       w4[proc]++;
1286     }
1287     for (j=0; j<size; j++) {
1288       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1289     }
1290   }
1291 
1292   nrqs     = 0;              /* no of outgoing messages */
1293   msz      = 0;              /* total mesg length (for all procs) */
1294   w1[rank] = 0;              /* no mesg sent to self */
1295   w3[rank] = 0;
1296   for (i=0; i<size; i++) {
1297     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1298   }
1299   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1300   for (i=0,j=0; i<size; i++) {
1301     if (w1[i]) { pa[j] = i; j++; }
1302   }
1303 
1304   /* Each message would have a header = 1 + 2*(no of IS) + data */
1305   for (i=0; i<nrqs; i++) {
1306     j      = pa[i];
1307     w1[j] += w2[j] + 2* w3[j];
1308     msz   += w1[j];
1309   }
1310   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1311 
1312   /* Determine the number of messages to expect, their lengths, from from-ids */
1313   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1314   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1315 
1316   /* Now post the Irecvs corresponding to these messages */
1317   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1318 
1319   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1320   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1321 
1322   /* Allocate Memory for outgoing messages */
1323   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1324   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1325   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1326 
1327   {
1328     PetscInt *iptr = tmp,ict = 0;
1329     for (i=0; i<nrqs; i++) {
1330       j        = pa[i];
1331       iptr    += ict;
1332       sbuf1[j] = iptr;
1333       ict      = w1[j];
1334     }
1335   }
1336 
1337   /* Form the outgoing messages */
1338   /* Initialize the header space */
1339   for (i=0; i<nrqs; i++) {
1340     j           = pa[i];
1341     sbuf1[j][0] = 0;
1342     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1343     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1344   }
1345 
1346   /* Parse the isrow and copy data into outbuf */
1347   for (i=0; i<ismax; i++) {
1348     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1349     irow_i = irow[i];
1350     jmax   = nrow[i];
1351     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1352       l   = 0;
1353       row = irow_i[j];
1354       while (row >= C->rmap->range[l+1]) l++;
1355       proc = l;
1356       if (proc != rank) { /* copy to the outgoing buf*/
1357         ctr[proc]++;
1358         *ptr[proc] = row;
1359         ptr[proc]++;
1360       }
1361     }
1362     /* Update the headers for the current IS */
1363     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1364       if ((ctr_j = ctr[j])) {
1365         sbuf1_j        = sbuf1[j];
1366         k              = ++sbuf1_j[0];
1367         sbuf1_j[2*k]   = ctr_j;
1368         sbuf1_j[2*k-1] = i;
1369       }
1370     }
1371   }
1372 
1373   /*  Now  post the sends */
1374   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1375   for (i=0; i<nrqs; ++i) {
1376     j    = pa[i];
1377     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1378   }
1379 
1380   /* Post Receives to capture the buffer size */
1381   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1382   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1383   rbuf2[0] = tmp + msz;
1384   for (i=1; i<nrqs; ++i) {
1385     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1386   }
1387   for (i=0; i<nrqs; ++i) {
1388     j    = pa[i];
1389     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1390   }
1391 
1392   /* Send to other procs the buf size they should allocate */
1393 
1394 
1395   /* Receive messages*/
1396   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1397   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1398   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1399   {
1400     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1401     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1402     PetscInt   *sbuf2_i;
1403 
1404     for (i=0; i<nrqr; ++i) {
1405       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1406 
1407       req_size[idex] = 0;
1408       rbuf1_i        = rbuf1[idex];
1409       start          = 2*rbuf1_i[0] + 1;
1410       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1411       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1412       sbuf2_i        = sbuf2[idex];
1413       for (j=start; j<end; j++) {
1414         id              = rbuf1_i[j] - rstart;
1415         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1416         sbuf2_i[j]      = ncols;
1417         req_size[idex] += ncols;
1418       }
1419       req_source[idex] = r_status1[i].MPI_SOURCE;
1420       /* form the header */
1421       sbuf2_i[0] = req_size[idex];
1422       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1423 
1424       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1425     }
1426   }
1427   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1428   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1429 
1430   /*  recv buffer sizes */
1431   /* Receive messages*/
1432 
1433   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1434   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1435   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1436   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1437   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1438 
1439   for (i=0; i<nrqs; ++i) {
1440     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1441     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1442     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1443     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1444     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1445   }
1446   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1447   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1448 
1449   /* Wait on sends1 and sends2 */
1450   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1451   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1452 
1453   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1454   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1455   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1456   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1457   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1458   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1459 
1460   /* Now allocate buffers for a->j, and send them off */
1461   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1462   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1463   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1464   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1465 
1466   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1467   {
1468     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1469     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1470     PetscInt cend = C->cmap->rend;
1471     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1472 
1473     for (i=0; i<nrqr; i++) {
1474       rbuf1_i   = rbuf1[i];
1475       sbuf_aj_i = sbuf_aj[i];
1476       ct1       = 2*rbuf1_i[0] + 1;
1477       ct2       = 0;
1478       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1479         kmax = rbuf1[i][2*j];
1480         for (k=0; k<kmax; k++,ct1++) {
1481           row    = rbuf1_i[ct1] - rstart;
1482           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1483           ncols  = nzA + nzB;
1484           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1485 
1486           /* load the column indices for this row into cols*/
1487           cols = sbuf_aj_i + ct2;
1488 
1489           lwrite = 0;
1490           for (l=0; l<nzB; l++) {
1491             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1492           }
1493           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1494           for (l=0; l<nzB; l++) {
1495             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1496           }
1497 
1498           ct2 += ncols;
1499         }
1500       }
1501       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1502     }
1503   }
1504   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1505   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1506 
1507   /* Allocate buffers for a->a, and send them off */
1508   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1509   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1510   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1511   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1512 
1513   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1514   {
1515     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1516     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1517     PetscInt    cend   = C->cmap->rend;
1518     PetscInt    *b_j   = b->j;
1519     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1520 
1521     for (i=0; i<nrqr; i++) {
1522       rbuf1_i   = rbuf1[i];
1523       sbuf_aa_i = sbuf_aa[i];
1524       ct1       = 2*rbuf1_i[0]+1;
1525       ct2       = 0;
1526       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1527         kmax = rbuf1_i[2*j];
1528         for (k=0; k<kmax; k++,ct1++) {
1529           row    = rbuf1_i[ct1] - rstart;
1530           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1531           ncols  = nzA + nzB;
1532           cworkB = b_j + b_i[row];
1533           vworkA = a_a + a_i[row];
1534           vworkB = b_a + b_i[row];
1535 
1536           /* load the column values for this row into vals*/
1537           vals = sbuf_aa_i+ct2;
1538 
1539           lwrite = 0;
1540           for (l=0; l<nzB; l++) {
1541             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1542           }
1543           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1544           for (l=0; l<nzB; l++) {
1545             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1546           }
1547 
1548           ct2 += ncols;
1549         }
1550       }
1551       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1552     }
1553   }
1554   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1555   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1556   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1557   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1558 
1559   /* Form the matrix */
1560   /* create col map: global col of C -> local col of submatrices */
1561   {
1562     const PetscInt *icol_i;
1563 #if defined(PETSC_USE_CTABLE)
1564     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1565     for (i=0; i<ismax; i++) {
1566       if (!allcolumns[i]) {
1567         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1568 
1569         jmax   = ncol[i];
1570         icol_i = icol[i];
1571         cmap_i = cmap[i];
1572         for (j=0; j<jmax; j++) {
1573           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1574         }
1575       } else {
1576         cmap[i] = NULL;
1577       }
1578     }
1579 #else
1580     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1581     for (i=0; i<ismax; i++) {
1582       if (!allcolumns[i]) {
1583         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1584         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1585         jmax   = ncol[i];
1586         icol_i = icol[i];
1587         cmap_i = cmap[i];
1588         for (j=0; j<jmax; j++) {
1589           cmap_i[icol_i[j]] = j+1;
1590         }
1591       } else {
1592         cmap[i] = NULL;
1593       }
1594     }
1595 #endif
1596   }
1597 
1598   /* Create lens which is required for MatCreate... */
1599   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1600   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1601   if (ismax) {
1602     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1603     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1604   }
1605   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1606 
1607   /* Update lens from local data */
1608   for (i=0; i<ismax; i++) {
1609     jmax = nrow[i];
1610     if (!allcolumns[i]) cmap_i = cmap[i];
1611     irow_i = irow[i];
1612     lens_i = lens[i];
1613     for (j=0; j<jmax; j++) {
1614       l   = 0;
1615       row = irow_i[j];
1616       while (row >= C->rmap->range[l+1]) l++;
1617       proc = l;
1618       if (proc == rank) {
1619         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1620         if (!allcolumns[i]) {
1621           for (k=0; k<ncols; k++) {
1622 #if defined(PETSC_USE_CTABLE)
1623             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1624 #else
1625             tcol = cmap_i[cols[k]];
1626 #endif
1627             if (tcol) lens_i[j]++;
1628           }
1629         } else { /* allcolumns */
1630           lens_i[j] = ncols;
1631         }
1632         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1633       }
1634     }
1635   }
1636 
1637   /* Create row map: global row of C -> local row of submatrices */
1638 #if defined(PETSC_USE_CTABLE)
1639   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1640   for (i=0; i<ismax; i++) {
1641     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1642     rmap_i = rmap[i];
1643     irow_i = irow[i];
1644     jmax   = nrow[i];
1645     for (j=0; j<jmax; j++) {
1646       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1647     }
1648   }
1649 #else
1650   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1651   if (ismax) {
1652     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1653     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1654   }
1655   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1656   for (i=0; i<ismax; i++) {
1657     rmap_i = rmap[i];
1658     irow_i = irow[i];
1659     jmax   = nrow[i];
1660     for (j=0; j<jmax; j++) {
1661       rmap_i[irow_i[j]] = j;
1662     }
1663   }
1664 #endif
1665 
1666   /* Update lens from offproc data */
1667   {
1668     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1669 
1670     for (tmp2=0; tmp2<nrqs; tmp2++) {
1671       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1672       idex    = pa[idex2];
1673       sbuf1_i = sbuf1[idex];
1674       jmax    = sbuf1_i[0];
1675       ct1     = 2*jmax+1;
1676       ct2     = 0;
1677       rbuf2_i = rbuf2[idex2];
1678       rbuf3_i = rbuf3[idex2];
1679       for (j=1; j<=jmax; j++) {
1680         is_no  = sbuf1_i[2*j-1];
1681         max1   = sbuf1_i[2*j];
1682         lens_i = lens[is_no];
1683         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1684         rmap_i = rmap[is_no];
1685         for (k=0; k<max1; k++,ct1++) {
1686 #if defined(PETSC_USE_CTABLE)
1687           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1688           row--;
1689           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1690 #else
1691           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1692 #endif
1693           max2 = rbuf2_i[ct1];
1694           for (l=0; l<max2; l++,ct2++) {
1695             if (!allcolumns[is_no]) {
1696 #if defined(PETSC_USE_CTABLE)
1697               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1698 #else
1699               tcol = cmap_i[rbuf3_i[ct2]];
1700 #endif
1701               if (tcol) lens_i[row]++;
1702             } else { /* allcolumns */
1703               lens_i[row]++; /* lens_i[row] += max2 ? */
1704             }
1705           }
1706         }
1707       }
1708     }
1709   }
1710   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1711   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1712   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1713   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1714   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1715 
1716   /* Create the submatrices */
1717   if (scall == MAT_REUSE_MATRIX) {
1718     PetscBool flag;
1719 
1720     /*
1721         Assumes new rows are same length as the old rows,hence bug!
1722     */
1723     for (i=0; i<ismax; i++) {
1724       mat = (Mat_SeqAIJ*)(submats[i]->data);
1725       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1726       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1727       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1728       /* Initial matrix as if empty */
1729       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1730 
1731       submats[i]->factortype = C->factortype;
1732     }
1733   } else {
1734     for (i=0; i<ismax; i++) {
1735       PetscInt rbs,cbs;
1736       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1737       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1738 
1739       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1740       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1741 
1742       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1743       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1744       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1745     }
1746   }
1747 
1748   /* Assemble the matrices */
1749   /* First assemble the local rows */
1750   {
1751     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1752     PetscScalar *imat_a;
1753 
1754     for (i=0; i<ismax; i++) {
1755       mat       = (Mat_SeqAIJ*)submats[i]->data;
1756       imat_ilen = mat->ilen;
1757       imat_j    = mat->j;
1758       imat_i    = mat->i;
1759       imat_a    = mat->a;
1760 
1761       if (!allcolumns[i]) cmap_i = cmap[i];
1762       rmap_i = rmap[i];
1763       irow_i = irow[i];
1764       jmax   = nrow[i];
1765       for (j=0; j<jmax; j++) {
1766         l   = 0;
1767         row = irow_i[j];
1768         while (row >= C->rmap->range[l+1]) l++;
1769         proc = l;
1770         if (proc == rank) {
1771           old_row = row;
1772 #if defined(PETSC_USE_CTABLE)
1773           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1774           row--;
1775 #else
1776           row = rmap_i[row];
1777 #endif
1778           ilen_row = imat_ilen[row];
1779           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1780           mat_i    = imat_i[row];
1781           mat_a    = imat_a + mat_i;
1782           mat_j    = imat_j + mat_i;
1783           if (!allcolumns[i]) {
1784             for (k=0; k<ncols; k++) {
1785 #if defined(PETSC_USE_CTABLE)
1786               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1787 #else
1788               tcol = cmap_i[cols[k]];
1789 #endif
1790               if (tcol) {
1791                 *mat_j++ = tcol - 1;
1792                 *mat_a++ = vals[k];
1793                 ilen_row++;
1794               }
1795             }
1796           } else { /* allcolumns */
1797             for (k=0; k<ncols; k++) {
1798               *mat_j++ = cols[k];  /* global col index! */
1799               *mat_a++ = vals[k];
1800               ilen_row++;
1801             }
1802           }
1803           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1804 
1805           imat_ilen[row] = ilen_row;
1806         }
1807       }
1808     }
1809   }
1810 
1811   /*   Now assemble the off proc rows*/
1812   {
1813     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1814     PetscInt    *imat_j,*imat_i;
1815     PetscScalar *imat_a,*rbuf4_i;
1816 
1817     for (tmp2=0; tmp2<nrqs; tmp2++) {
1818       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1819       idex    = pa[idex2];
1820       sbuf1_i = sbuf1[idex];
1821       jmax    = sbuf1_i[0];
1822       ct1     = 2*jmax + 1;
1823       ct2     = 0;
1824       rbuf2_i = rbuf2[idex2];
1825       rbuf3_i = rbuf3[idex2];
1826       rbuf4_i = rbuf4[idex2];
1827       for (j=1; j<=jmax; j++) {
1828         is_no     = sbuf1_i[2*j-1];
1829         rmap_i    = rmap[is_no];
1830         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1831         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1832         imat_ilen = mat->ilen;
1833         imat_j    = mat->j;
1834         imat_i    = mat->i;
1835         imat_a    = mat->a;
1836         max1      = sbuf1_i[2*j];
1837         for (k=0; k<max1; k++,ct1++) {
1838           row = sbuf1_i[ct1];
1839 #if defined(PETSC_USE_CTABLE)
1840           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1841           row--;
1842 #else
1843           row = rmap_i[row];
1844 #endif
1845           ilen  = imat_ilen[row];
1846           mat_i = imat_i[row];
1847           mat_a = imat_a + mat_i;
1848           mat_j = imat_j + mat_i;
1849           max2  = rbuf2_i[ct1];
1850           if (!allcolumns[is_no]) {
1851             for (l=0; l<max2; l++,ct2++) {
1852 
1853 #if defined(PETSC_USE_CTABLE)
1854               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1855 #else
1856               tcol = cmap_i[rbuf3_i[ct2]];
1857 #endif
1858               if (tcol) {
1859                 *mat_j++ = tcol - 1;
1860                 *mat_a++ = rbuf4_i[ct2];
1861                 ilen++;
1862               }
1863             }
1864           } else { /* allcolumns */
1865             for (l=0; l<max2; l++,ct2++) {
1866               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1867               *mat_a++ = rbuf4_i[ct2];
1868               ilen++;
1869             }
1870           }
1871           imat_ilen[row] = ilen;
1872         }
1873       }
1874     }
1875   }
1876 
1877   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1878   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1879   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1880   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1881   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1882 
1883   /* Restore the indices */
1884   for (i=0; i<ismax; i++) {
1885     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1886     if (!allcolumns[i]) {
1887       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1888     }
1889   }
1890 
1891   /* Destroy allocated memory */
1892   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1893   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1894   ierr = PetscFree(pa);CHKERRQ(ierr);
1895 
1896   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1897   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1898   for (i=0; i<nrqr; ++i) {
1899     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1900   }
1901   for (i=0; i<nrqs; ++i) {
1902     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1903     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1904   }
1905 
1906   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1907   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1908   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1909   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1910   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1911   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1912   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1913 
1914 #if defined(PETSC_USE_CTABLE)
1915   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1916 #else
1917   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1918 #endif
1919   ierr = PetscFree(rmap);CHKERRQ(ierr);
1920 
1921   for (i=0; i<ismax; i++) {
1922     if (!allcolumns[i]) {
1923 #if defined(PETSC_USE_CTABLE)
1924       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1925 #else
1926       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1927 #endif
1928     }
1929   }
1930   ierr = PetscFree(cmap);CHKERRQ(ierr);
1931   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1932   ierr = PetscFree(lens);CHKERRQ(ierr);
1933 
1934   for (i=0; i<ismax; i++) {
1935     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1936     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1937   }
1938   PetscFunctionReturn(0);
1939 }
1940 
1941 /*
1942  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1943  Be careful not to destroy them elsewhere.
1944  */
1945 #undef __FUNCT__
1946 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1947 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1948 {
1949   /* If making this function public, change the error returned in this function away from _PLIB. */
1950   PetscErrorCode ierr;
1951   Mat_MPIAIJ     *aij;
1952   PetscBool      seqaij;
1953 
1954   PetscFunctionBegin;
1955   /* Check to make sure the component matrices are compatible with C. */
1956   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1957   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1958   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1959   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1960   if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1961 
1962   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1963   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1964   ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
1965   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1966   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1967   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1968   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1969   aij    = (Mat_MPIAIJ*)((*C)->data);
1970   aij->A = A;
1971   aij->B = B;
1972   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1973   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1974 
1975   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1976   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1977   PetscFunctionReturn(0);
1978 }
1979 
1980 #undef __FUNCT__
1981 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1982 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1983 {
1984   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1985 
1986   PetscFunctionBegin;
1987   PetscValidPointer(A,2);
1988   PetscValidPointer(B,3);
1989   *A = aij->A;
1990   *B = aij->B;
1991   /* Note that we don't incref *A and *B, so be careful! */
1992   PetscFunctionReturn(0);
1993 }
1994 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Type-generic driver extracting one submatrix of C per (isrow[i],iscol[i]) pair.
   Each submatrix lives on the communicator of its index sets: sequential (size-1)
   index sets yield sequential submatrices, parallel index sets yield parallel
   submatrices assembled from separately extracted diagonal and off-diagonal
   sequential pieces.  Matrix-format-specific work is delegated to three callbacks:
     getsubmats_seq - extract sequential submatrices (used for both diag and off-diag parts);
     makefromseq    - build a parallel matrix from a (diag,off-diag) pair of seq matrices;
     extractseq     - recover the (diag,off-diag) seq pieces of an existing parallel
                      matrix (used only when scall == MAT_REUSE_MATRIX).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c; /* number of index-set pairs living on a parallel (size > 1) communicator */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Validate that each row/column IS pair shares one communicator, and count the parallel pairs. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B; /* A[i]: diag piece for EVERY i; B[ii]: off-diag piece, parallel ISs only */
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i]; /* sequential submatrix IS its own diag piece */
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i]; /* alias only; never destroyed below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    /* NOTE(review): iscol_c[] was obtained via ISGetNonlocalIS(); ISDestroy here presumably
       just releases the reference acquired above -- confirm against the IS implementation. */
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Free only the placeholder arrays; the seq matrices themselves are now owned by submat[]. */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */
2100 
2101 
2102 
2103 #undef __FUNCT__
2104 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
2105 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2106 {
2107   PetscErrorCode ierr;
2108 
2109   PetscFunctionBegin;
2110   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
2111   PetscFunctionReturn(0);
2112 }
2113