xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision f9185b66528dabef5134ab6cdcf228fe8b971b0e)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that are shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   Increase the overlap of the index sets is[] by one layer, using memory
   proportional to the local (not global) matrix size.

   Outline:
     1) find which requested rows are owned by other processes;
     2) ship <is id, row> pairs to their owners through a PetscSF;
     3) owners expand each received row into its nonzero columns
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and send the result back;
     4) locally owned rows are expanded in place
        (MatIncreaseOverlap_MPIAIJ_Local_Scalable), overlapped with the
        communication of step 3;
     5) the returned rows are merged into is[]
        (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row layout: gives the ownership range of every process */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices of each IS; tlength is the total over all sets */
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* record every row that lives on a remote process, together with its
     owner rank and the IS it came from */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
	length_i     = length[i];
	indices_i    = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote rows only: locally owned rows are handled by the _Local_Scalable pass */
      if(owner != rank){
    	tosizes_temp[owner]++; /* number of rows to owner */
    	rrow_ranks[nrrows]  = owner; /* processor */
    	rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* no process has remote rows, so no communication is needed;
   * this usually corresponds to overlap 1
   * */
  if(!reducednrrows){
	ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
	ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
	ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
	PetscFunctionReturn(0);
  }
  nto = 0;
  /* compress the per-rank counts into <rank, size> send descriptors */
  for(i=0; i<size; i++){
   if(tosizes_temp[i]){
    tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two PetscInts (<is id, row>) per row */
    tosizes_temp[i] = nto; /* a map from processor to index */
	toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
	toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
	tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* exchange <size, offset> pairs with the receiving processes */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest whose leaves are the entries we will receive */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
	for(j=0; j<fromsizes[2*i]; j++){
	  remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
	}
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* pack message pairs <no of is, row> at the offsets agreed on above */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for(i=0; i<nrrows; i++){
	owner = rrow_ranks[i]; /* processor */
	j     = tosizes_temp[owner]; /* index */
	todata[toffsets[j]++] = rrow_isids[i];
	todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* owners expand the requested rows into their nonzero columns */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: sizes/offsets for the reply messages */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
  	for(j=0; j<tosizes[2*i]; j++){
  	  remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
  	}
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication (reply broadcast) with local computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received expanded rows into the index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
201 #undef __FUNCT__
202 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
203 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
204 {
205   PetscInt         *isz,isz_i,i,j,is_id, data_size;
206   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
207   const PetscInt   *indices_i_temp;
208   PetscErrorCode    ierr;
209 
210   PetscFunctionBegin;
211   max_lsize = 0;
212   ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
213   for(i=0; i<nidx; i++){
214    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
215    max_lsize = lsize>max_lsize ? lsize:max_lsize;
216    isz[i]    = lsize;
217   }
218   ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
219   for(i=0; i<nidx; i++){
220     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
221 	ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
222 	ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
223 	ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
224   }
225   /*retrieve information */
226   for(i=0; i<nrecvs; ){
227 	is_id      = recvdata[i++];
228 	data_size  = recvdata[i++];
229 	indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
230 	isz_i      = isz[is_id];
231     for(j=0; j< data_size; j++){
232       col = recvdata[i++];
233       indices_i[isz_i++] = col;
234     }
235     isz[is_id] = isz_i;
236   }
237   /*remove duplicate entities*/
238   for(i=0; i<nidx; i++){
239    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
240    isz_i      = isz[i];
241    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
242    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
243   }
244   ierr = PetscFree(isz);CHKERRQ(ierr);
245   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
246   PetscFunctionReturn(0);
247 }
248 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   Build the reply messages for the rows other processes requested.

   fromrows holds nfrom messages of <is id, row> pairs (fromsizes[2*i] entries
   from process fromranks[i]).  For each requested row owned here, the row is
   expanded into its nonzero column indices.  The result, returned in
   *sbrows, is a sequence of triples <is id, count, count columns> per
   destination process; *sbrowsizes holds the <size, offset> pair for each
   destination.  The caller owns and frees both output arrays.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values, used to bound the scratch buffer */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest incoming message */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count, per <sender, is>, how many column indices the
     expansion produces, so that sbdata can be allocated in one shot */
  for(i=0; i<nfrom; i++){
	ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
	/* group the requested rows of this sender by index set */
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, now writing the data into sbdata */
  for(i=0;i<nfrom;i++){
  	ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* per-destination offsets, stored in the odd entries of sbsizes */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
	offsets[i+1]   = offsets[i] + sbsizes[2*i];
	sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
377 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   Expand each index set by the nonzero columns of its locally owned rows.

   Rows owned by other processes are skipped here; they are handled through
   the communication path in MatIncreaseOverlap_MPIAIJ_Once_Scalable.  Each
   input IS is destroyed and replaced by a sorted, duplicate-free IS of the
   collected column indices.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat   	        amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /* amat = diagonal block, bmat = off-diagonal block, gcols = global
     column indices for bmat's compressed columns */
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) {
	ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
	ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
	lsize_tmp = 0;
    for(j=0; j<lsize; j++) {
       owner = -1;
       row   = indices[j];
       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
       if(owner != rank) continue; /* remote rows handled elsewhere */
       /* convert to a local row number */
       row  -= rstart;
       start = ai[row];
       end   = ai[row+1];
       for(k=start; k<end; k++) { /* Amat: local column + cstart is global */
         col = aj[k] + cstart;
         indices_temp[lsize_tmp++] = col;
       }
       start = bi[row];
       end   = bi[row+1];
       for(k=start; k<end; k++) { /* Bmat: gcols maps to global columns */
         col = gcols[bj[k]];
         indices_temp[lsize_tmp++] = col;
       }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
443 
444 
445 /*
446   Sample message format:
447   If a processor A wants processor B to process some elements corresponding
448   to index sets is[1],is[5]
449   mesg [0] = 2   (no of index sets in the mesg)
450   -----------
451   mesg [1] = 1 => is[1]
452   mesg [2] = sizeof(is[1]);
453   -----------
454   mesg [3] = 5  => is[5]
455   mesg [4] = sizeof(is[5]);
456   -----------
457   mesg [5]
458   mesg [n]  datas[1]
459   -----------
460   mesg[n+1]
461   mesg[m]  data(is[5])
462   -----------
463 
  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
467 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   Increase the overlap of the index sets is[] by one layer, using
   hand-rolled MPI point-to-point messaging.  Each process sends the rows it
   needs to their owners (tag1), expands its own rows locally while waiting,
   receives the expanded rows back (tag2), and rebuilds the index sets.
   NOTE: this variant allocates arrays of the global row size M per index
   set (d_p, t_p); the _Scalable path avoids that.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total rows destined for p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of index sets contributing rows for p */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* per-IS row counts */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one sub-buffer per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: outdat[j][0] = no of IS, then <is id, count> pairs */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: a bit table and a length-M data
     array per index set */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the sends are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received row requests into reply messages xdata/isz1 */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same <no of is, <is id, count> pairs, data>
         layout as the phase-1 requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the index sets from the accumulated rows */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
740 
741 #undef __FUNCT__
742 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
743 /*
744    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
745        the work on the local processor.
746 
747      Inputs:
748       C      - MAT_MPIAIJ;
749       imax - total no of index sets processed at a time;
750       table  - an array of char - size = m bits.
751 
752      Output:
753       isz    - array containing the count of the solution elements corresponding
754                to each index set;
755       data   - pointer to the solutions
756 */
757 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
758 {
759   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
760   Mat        A  = c->A,B = c->B;
761   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
762   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
763   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
764   PetscBT    table_i;
765 
766   PetscFunctionBegin;
767   rstart = C->rmap->rstart;
768   cstart = C->cmap->rstart;
769   ai     = a->i;
770   aj     = a->j;
771   bi     = b->i;
772   bj     = b->j;
773   garray = c->garray;
774 
775 
776   for (i=0; i<imax; i++) {
777     data_i  = data[i];
778     table_i = table[i];
779     isz_i   = isz[i];
780     for (j=0,max=isz[i]; j<max; j++) {
781       row   = data_i[j] - rstart;
782       start = ai[row];
783       end   = ai[row+1];
784       for (k=start; k<end; k++) { /* Amat */
785         val = aj[k] + cstart;
786         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
787       }
788       start = bi[row];
789       end   = bi[row+1];
790       for (k=start; k<end; k++) { /* Bmat */
791         val = garray[bj[k]];
792         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
793       }
794     }
795     isz[i] = isz_i;
796   }
797   PetscFunctionReturn(0);
798 }
799 
800 #undef __FUNCT__
801 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
802 /*
803       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
804          and return the output
805 
806          Input:
807            C    - the matrix
808            nrqr - no of messages being processed.
809            rbuf - an array of pointers to the recieved requests
810 
811          Output:
812            xdata - array of messages to be sent back
813            isz1  - size of each message
814 
815   For better efficiency perhaps we should malloc separately each xdata[i],
816 then if a remalloc is required we need only copy the data for that one row
817 rather then all previous rows as it is now where a single large chunck of
818 memory is used.
819 
820 */
821 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
822 {
823   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
824   Mat            A  = c->A,B = c->B;
825   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
826   PetscErrorCode ierr;
827   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
828   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
829   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
830   PetscInt       *rbuf_i,kmax,rbuf_0;
831   PetscBT        xtable;
832 
833   PetscFunctionBegin;
834   m      = C->rmap->N;
835   rstart = C->rmap->rstart;
836   cstart = C->cmap->rstart;
837   ai     = a->i;
838   aj     = a->j;
839   bi     = b->i;
840   bj     = b->j;
841   garray = c->garray;
842 
843 
844   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
845     rbuf_i =  rbuf[i];
846     rbuf_0 =  rbuf_i[0];
847     ct    += rbuf_0;
848     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
849   }
850 
851   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
852   else max1 = 1;
853   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
854   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
855   ++no_malloc;
856   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
857   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
858 
859   ct3 = 0;
860   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
861     rbuf_i =  rbuf[i];
862     rbuf_0 =  rbuf_i[0];
863     ct1    =  2*rbuf_0+1;
864     ct2    =  ct1;
865     ct3   += ct1;
866     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
867       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
868       oct2 = ct2;
869       kmax = rbuf_i[2*j];
870       for (k=0; k<kmax; k++,ct1++) {
871         row = rbuf_i[ct1];
872         if (!PetscBTLookupSet(xtable,row)) {
873           if (!(ct3 < mem_estimate)) {
874             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
875             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
876             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
877             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
878             xdata[0]     = tmp;
879             mem_estimate = new_estimate; ++no_malloc;
880             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
881           }
882           xdata[i][ct2++] = row;
883           ct3++;
884         }
885       }
886       for (k=oct2,max2=ct2; k<max2; k++) {
887         row   = xdata[i][k] - rstart;
888         start = ai[row];
889         end   = ai[row+1];
890         for (l=start; l<end; l++) {
891           val = aj[l] + cstart;
892           if (!PetscBTLookupSet(xtable,val)) {
893             if (!(ct3 < mem_estimate)) {
894               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
895               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
896               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
897               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
898               xdata[0]     = tmp;
899               mem_estimate = new_estimate; ++no_malloc;
900               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
901             }
902             xdata[i][ct2++] = val;
903             ct3++;
904           }
905         }
906         start = bi[row];
907         end   = bi[row+1];
908         for (l=start; l<end; l++) {
909           val = garray[bj[l]];
910           if (!PetscBTLookupSet(xtable,val)) {
911             if (!(ct3 < mem_estimate)) {
912               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
913               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
914               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
915               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
916               xdata[0]     = tmp;
917               mem_estimate = new_estimate; ++no_malloc;
918               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
919             }
920             xdata[i][ct2++] = val;
921             ct3++;
922           }
923         }
924       }
925       /* Update the header*/
926       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
927       xdata[i][2*j-1] = rbuf_i[2*j-1];
928     }
929     xdata[i][0] = rbuf_0;
930     xdata[i+1]  = xdata[i] + ct2;
931     isz1[i]     = ct2; /* size of each message */
932   }
933   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
934   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
935   PetscFunctionReturn(0);
936 }
937 /* -------------------------------------------------------------------------*/
938 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
939 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
940 /*
941     Every processor gets the entire matrix
942 */
943 #undef __FUNCT__
944 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
945 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
946 {
947   Mat            B;
948   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
949   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
950   PetscErrorCode ierr;
951   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
952   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
953   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
954   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
955 
956   PetscFunctionBegin;
957   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
958   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
959 
960   if (scall == MAT_INITIAL_MATRIX) {
961     /* ----------------------------------------------------------------
962          Tell every processor the number of nonzeros per row
963     */
964     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
965     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
966       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
967     }
968     sendcount = A->rmap->rend - A->rmap->rstart;
969     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
970     for (i=0; i<size; i++) {
971       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
972       displs[i]     = A->rmap->range[i];
973     }
974 #if defined(PETSC_HAVE_MPI_IN_PLACE)
975     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
976 #else
977     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
978 #endif
979     /* ---------------------------------------------------------------
980          Create the sequential matrix of the same type as the local block diagonal
981     */
982     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
983     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
984     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
985     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
986     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
987     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
988     **Bin = B;
989     b     = (Mat_SeqAIJ*)B->data;
990 
991     /*--------------------------------------------------------------------
992        Copy my part of matrix column indices over
993     */
994     sendcount  = ad->nz + bd->nz;
995     jsendbuf   = b->j + b->i[rstarts[rank]];
996     a_jsendbuf = ad->j;
997     b_jsendbuf = bd->j;
998     n          = A->rmap->rend - A->rmap->rstart;
999     cnt        = 0;
1000     for (i=0; i<n; i++) {
1001 
1002       /* put in lower diagonal portion */
1003       m = bd->i[i+1] - bd->i[i];
1004       while (m > 0) {
1005         /* is it above diagonal (in bd (compressed) numbering) */
1006         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1007         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1008         m--;
1009       }
1010 
1011       /* put in diagonal portion */
1012       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1013         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1014       }
1015 
1016       /* put in upper diagonal portion */
1017       while (m-- > 0) {
1018         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1019       }
1020     }
1021     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1022 
1023     /*--------------------------------------------------------------------
1024        Gather all column indices to all processors
1025     */
1026     for (i=0; i<size; i++) {
1027       recvcounts[i] = 0;
1028       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1029         recvcounts[i] += lens[j];
1030       }
1031     }
1032     displs[0] = 0;
1033     for (i=1; i<size; i++) {
1034       displs[i] = displs[i-1] + recvcounts[i-1];
1035     }
1036 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1037     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1038 #else
1039     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1040 #endif
1041     /*--------------------------------------------------------------------
1042         Assemble the matrix into useable form (note numerical values not yet set)
1043     */
1044     /* set the b->ilen (length of each row) values */
1045     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1046     /* set the b->i indices */
1047     b->i[0] = 0;
1048     for (i=1; i<=A->rmap->N; i++) {
1049       b->i[i] = b->i[i-1] + lens[i-1];
1050     }
1051     ierr = PetscFree(lens);CHKERRQ(ierr);
1052     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1053     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1054 
1055   } else {
1056     B = **Bin;
1057     b = (Mat_SeqAIJ*)B->data;
1058   }
1059 
1060   /*--------------------------------------------------------------------
1061        Copy my part of matrix numerical values into the values location
1062   */
1063   if (flag == MAT_GET_VALUES) {
1064     sendcount = ad->nz + bd->nz;
1065     sendbuf   = b->a + b->i[rstarts[rank]];
1066     a_sendbuf = ad->a;
1067     b_sendbuf = bd->a;
1068     b_sendj   = bd->j;
1069     n         = A->rmap->rend - A->rmap->rstart;
1070     cnt       = 0;
1071     for (i=0; i<n; i++) {
1072 
1073       /* put in lower diagonal portion */
1074       m = bd->i[i+1] - bd->i[i];
1075       while (m > 0) {
1076         /* is it above diagonal (in bd (compressed) numbering) */
1077         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1078         sendbuf[cnt++] = *b_sendbuf++;
1079         m--;
1080         b_sendj++;
1081       }
1082 
1083       /* put in diagonal portion */
1084       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1085         sendbuf[cnt++] = *a_sendbuf++;
1086       }
1087 
1088       /* put in upper diagonal portion */
1089       while (m-- > 0) {
1090         sendbuf[cnt++] = *b_sendbuf++;
1091         b_sendj++;
1092       }
1093     }
1094     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1095 
1096     /* -----------------------------------------------------------------
1097        Gather all numerical values to all processors
1098     */
1099     if (!recvcounts) {
1100       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1101     }
1102     for (i=0; i<size; i++) {
1103       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1104     }
1105     displs[0] = 0;
1106     for (i=1; i<size; i++) {
1107       displs[i] = displs[i-1] + recvcounts[i-1];
1108     }
1109     recvbuf = b->a;
1110 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1111     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1112 #else
1113     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1114 #endif
1115   }  /* endof (flag == MAT_GET_VALUES) */
1116   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1117 
1118   if (A->symmetric) {
1119     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1120   } else if (A->hermitian) {
1121     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1122   } else if (A->structurally_symmetric) {
1123     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1124   }
1125   PetscFunctionReturn(0);
1126 }
1127 
1128 
1129 
1130 #undef __FUNCT__
1131 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1132 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1133 {
1134   PetscErrorCode ierr;
1135   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1136   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1137 
1138   PetscFunctionBegin;
1139 
1140   /*
1141        Check for special case: each processor gets entire matrix
1142   */
1143   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1144     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1145     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1146     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1147     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1148     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1149       wantallmatrix = PETSC_TRUE;
1150 
1151       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1152     }
1153   }
1154   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1155   if (twantallmatrix) {
1156     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1157     PetscFunctionReturn(0);
1158   }
1159 
1160   /* Allocate memory to hold all the submatrices */
1161   if (scall != MAT_REUSE_MATRIX) {
1162     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1163   }
1164 
1165   /* Check for special case: each processor gets entire matrix columns */
1166   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1167   for (i=0; i<ismax; i++) {
1168     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1169     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1170     if (colflag && ncol == C->cmap->N) {
1171       allcolumns[i] = PETSC_TRUE;
1172     } else {
1173       allcolumns[i] = PETSC_FALSE;
1174     }
1175   }
1176 
1177   /* Determine the number of stages through which submatrices are done */
1178   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1179 
1180   /*
1181      Each stage will extract nmax submatrices.
1182      nmax is determined by the matrix column dimension.
1183      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1184   */
1185   if (!nmax) nmax = 1;
1186   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1187 
1188   /* Make sure every processor loops through the nstages */
1189   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1190 
1191   for (i=0,pos=0; i<nstages; i++) {
1192     if (pos+nmax <= ismax) max_no = nmax;
1193     else if (pos == ismax) max_no = 0;
1194     else                   max_no = ismax-pos;
1195     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1196     pos += max_no;
1197   }
1198 
1199   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1200   PetscFunctionReturn(0);
1201 }
1202 
1203 /* -------------------------------------------------------------------------*/
1204 #undef __FUNCT__
1205 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1206 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1207 {
1208   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1209   Mat            A  = c->A;
1210   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1211   const PetscInt **icol,**irow;
1212   PetscInt       *nrow,*ncol,start;
1213   PetscErrorCode ierr;
1214   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1215   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1216   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1217   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1218   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1219 #if defined(PETSC_USE_CTABLE)
1220   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1221 #else
1222   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1223 #endif
1224   const PetscInt *irow_i;
1225   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1226   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1227   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1228   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1229   MPI_Status     *r_status3,*r_status4,*s_status4;
1230   MPI_Comm       comm;
1231   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1232   PetscMPIInt    *onodes1,*olengths1;
1233   PetscMPIInt    idex,idex2,end;
1234 
1235   PetscFunctionBegin;
1236   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1237   tag0 = ((PetscObject)C)->tag;
1238   size = c->size;
1239   rank = c->rank;
1240 
1241   /* Get some new tags to keep the communication clean */
1242   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1243   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1244   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1245 
1246   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1247 
1248   for (i=0; i<ismax; i++) {
1249     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1250     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1251     if (allcolumns[i]) {
1252       icol[i] = NULL;
1253       ncol[i] = C->cmap->N;
1254     } else {
1255       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1256       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1257     }
1258   }
1259 
1260   /* evaluate communication - mesg to who, length of mesg, and buffer space
1261      required. Based on this, buffers are allocated, and data copied into them*/
1262   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1263   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1264   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1265   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1266   for (i=0; i<ismax; i++) {
1267     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1268     jmax   = nrow[i];
1269     irow_i = irow[i];
1270     for (j=0; j<jmax; j++) {
1271       l   = 0;
1272       row = irow_i[j];
1273       while (row >= C->rmap->range[l+1]) l++;
1274       proc = l;
1275       w4[proc]++;
1276     }
1277     for (j=0; j<size; j++) {
1278       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1279     }
1280   }
1281 
1282   nrqs     = 0;              /* no of outgoing messages */
1283   msz      = 0;              /* total mesg length (for all procs) */
1284   w1[rank] = 0;              /* no mesg sent to self */
1285   w3[rank] = 0;
1286   for (i=0; i<size; i++) {
1287     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1288   }
1289   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1290   for (i=0,j=0; i<size; i++) {
1291     if (w1[i]) { pa[j] = i; j++; }
1292   }
1293 
1294   /* Each message would have a header = 1 + 2*(no of IS) + data */
1295   for (i=0; i<nrqs; i++) {
1296     j      = pa[i];
1297     w1[j] += w2[j] + 2* w3[j];
1298     msz   += w1[j];
1299   }
1300   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1301 
1302   /* Determine the number of messages to expect, their lengths, from from-ids */
1303   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1304   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1305 
1306   /* Now post the Irecvs corresponding to these messages */
1307   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1308 
1309   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1310   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1311 
1312   /* Allocate Memory for outgoing messages */
1313   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1314   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1315   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1316 
1317   {
1318     PetscInt *iptr = tmp,ict = 0;
1319     for (i=0; i<nrqs; i++) {
1320       j        = pa[i];
1321       iptr    += ict;
1322       sbuf1[j] = iptr;
1323       ict      = w1[j];
1324     }
1325   }
1326 
1327   /* Form the outgoing messages */
1328   /* Initialize the header space */
1329   for (i=0; i<nrqs; i++) {
1330     j           = pa[i];
1331     sbuf1[j][0] = 0;
1332     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1333     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1334   }
1335 
1336   /* Parse the isrow and copy data into outbuf */
1337   for (i=0; i<ismax; i++) {
1338     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1339     irow_i = irow[i];
1340     jmax   = nrow[i];
1341     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1342       l   = 0;
1343       row = irow_i[j];
1344       while (row >= C->rmap->range[l+1]) l++;
1345       proc = l;
1346       if (proc != rank) { /* copy to the outgoing buf*/
1347         ctr[proc]++;
1348         *ptr[proc] = row;
1349         ptr[proc]++;
1350       }
1351     }
1352     /* Update the headers for the current IS */
1353     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1354       if ((ctr_j = ctr[j])) {
1355         sbuf1_j        = sbuf1[j];
1356         k              = ++sbuf1_j[0];
1357         sbuf1_j[2*k]   = ctr_j;
1358         sbuf1_j[2*k-1] = i;
1359       }
1360     }
1361   }
1362 
1363   /*  Now  post the sends */
1364   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1365   for (i=0; i<nrqs; ++i) {
1366     j    = pa[i];
1367     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1368   }
1369 
1370   /* Post Receives to capture the buffer size */
1371   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1372   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1373   rbuf2[0] = tmp + msz;
1374   for (i=1; i<nrqs; ++i) {
1375     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1376   }
1377   for (i=0; i<nrqs; ++i) {
1378     j    = pa[i];
1379     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1380   }
1381 
1382   /* Send to other procs the buf size they should allocate */
1383 
1384 
1385   /* Receive messages*/
1386   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1387   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1388   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1389   {
1390     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1391     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1392     PetscInt   *sbuf2_i;
1393 
1394     for (i=0; i<nrqr; ++i) {
1395       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1396 
1397       req_size[idex] = 0;
1398       rbuf1_i        = rbuf1[idex];
1399       start          = 2*rbuf1_i[0] + 1;
1400       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1401       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1402       sbuf2_i        = sbuf2[idex];
1403       for (j=start; j<end; j++) {
1404         id              = rbuf1_i[j] - rstart;
1405         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1406         sbuf2_i[j]      = ncols;
1407         req_size[idex] += ncols;
1408       }
1409       req_source[idex] = r_status1[i].MPI_SOURCE;
1410       /* form the header */
1411       sbuf2_i[0] = req_size[idex];
1412       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1413 
1414       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1415     }
1416   }
1417   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1418   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1419 
1420   /*  recv buffer sizes */
1421   /* Receive messages*/
1422 
1423   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1424   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1425   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1426   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1427   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1428 
1429   for (i=0; i<nrqs; ++i) {
1430     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1431     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1432     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1433     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1434     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1435   }
1436   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1437   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1438 
1439   /* Wait on sends1 and sends2 */
1440   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1441   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1442 
1443   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1444   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1445   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1446   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1447   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1448   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1449 
1450   /* Now allocate buffers for a->j, and send them off */
1451   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1452   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1453   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1454   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1455 
1456   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1457   {
1458     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1459     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1460     PetscInt cend = C->cmap->rend;
1461     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1462 
1463     for (i=0; i<nrqr; i++) {
1464       rbuf1_i   = rbuf1[i];
1465       sbuf_aj_i = sbuf_aj[i];
1466       ct1       = 2*rbuf1_i[0] + 1;
1467       ct2       = 0;
1468       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1469         kmax = rbuf1[i][2*j];
1470         for (k=0; k<kmax; k++,ct1++) {
1471           row    = rbuf1_i[ct1] - rstart;
1472           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1473           ncols  = nzA + nzB;
1474           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1475 
1476           /* load the column indices for this row into cols*/
1477           cols = sbuf_aj_i + ct2;
1478 
1479           lwrite = 0;
1480           for (l=0; l<nzB; l++) {
1481             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482           }
1483           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484           for (l=0; l<nzB; l++) {
1485             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486           }
1487 
1488           ct2 += ncols;
1489         }
1490       }
1491       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1492     }
1493   }
1494   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1495   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1496 
1497   /* Allocate buffers for a->a, and send them off */
1498   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1499   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1500   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1501   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1502 
1503   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1504   {
1505     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1506     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1507     PetscInt    cend   = C->cmap->rend;
1508     PetscInt    *b_j   = b->j;
1509     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1510 
1511     for (i=0; i<nrqr; i++) {
1512       rbuf1_i   = rbuf1[i];
1513       sbuf_aa_i = sbuf_aa[i];
1514       ct1       = 2*rbuf1_i[0]+1;
1515       ct2       = 0;
1516       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1517         kmax = rbuf1_i[2*j];
1518         for (k=0; k<kmax; k++,ct1++) {
1519           row    = rbuf1_i[ct1] - rstart;
1520           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1521           ncols  = nzA + nzB;
1522           cworkB = b_j + b_i[row];
1523           vworkA = a_a + a_i[row];
1524           vworkB = b_a + b_i[row];
1525 
1526           /* load the column values for this row into vals*/
1527           vals = sbuf_aa_i+ct2;
1528 
1529           lwrite = 0;
1530           for (l=0; l<nzB; l++) {
1531             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1532           }
1533           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1534           for (l=0; l<nzB; l++) {
1535             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1536           }
1537 
1538           ct2 += ncols;
1539         }
1540       }
1541       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1542     }
1543   }
1544   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1545   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1546   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1547   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1548 
1549   /* Form the matrix */
1550   /* create col map: global col of C -> local col of submatrices */
1551   {
1552     const PetscInt *icol_i;
1553 #if defined(PETSC_USE_CTABLE)
1554     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1555     for (i=0; i<ismax; i++) {
1556       if (!allcolumns[i]) {
1557         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1558 
1559         jmax   = ncol[i];
1560         icol_i = icol[i];
1561         cmap_i = cmap[i];
1562         for (j=0; j<jmax; j++) {
1563           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1564         }
1565       } else {
1566         cmap[i] = NULL;
1567       }
1568     }
1569 #else
1570     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1571     for (i=0; i<ismax; i++) {
1572       if (!allcolumns[i]) {
1573         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1574         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1575         jmax   = ncol[i];
1576         icol_i = icol[i];
1577         cmap_i = cmap[i];
1578         for (j=0; j<jmax; j++) {
1579           cmap_i[icol_i[j]] = j+1;
1580         }
1581       } else {
1582         cmap[i] = NULL;
1583       }
1584     }
1585 #endif
1586   }
1587 
1588   /* Create lens which is required for MatCreate... */
1589   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1590   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1591   if (ismax) {
1592     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1593     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1594   }
1595   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1596 
1597   /* Update lens from local data */
1598   for (i=0; i<ismax; i++) {
1599     jmax = nrow[i];
1600     if (!allcolumns[i]) cmap_i = cmap[i];
1601     irow_i = irow[i];
1602     lens_i = lens[i];
1603     for (j=0; j<jmax; j++) {
1604       l   = 0;
1605       row = irow_i[j];
1606       while (row >= C->rmap->range[l+1]) l++;
1607       proc = l;
1608       if (proc == rank) {
1609         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1610         if (!allcolumns[i]) {
1611           for (k=0; k<ncols; k++) {
1612 #if defined(PETSC_USE_CTABLE)
1613             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1614 #else
1615             tcol = cmap_i[cols[k]];
1616 #endif
1617             if (tcol) lens_i[j]++;
1618           }
1619         } else { /* allcolumns */
1620           lens_i[j] = ncols;
1621         }
1622         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1623       }
1624     }
1625   }
1626 
1627   /* Create row map: global row of C -> local row of submatrices */
1628 #if defined(PETSC_USE_CTABLE)
1629   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1630   for (i=0; i<ismax; i++) {
1631     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1632     rmap_i = rmap[i];
1633     irow_i = irow[i];
1634     jmax   = nrow[i];
1635     for (j=0; j<jmax; j++) {
1636       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1637     }
1638   }
1639 #else
1640   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1641   if (ismax) {
1642     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1643     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1644   }
1645   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1646   for (i=0; i<ismax; i++) {
1647     rmap_i = rmap[i];
1648     irow_i = irow[i];
1649     jmax   = nrow[i];
1650     for (j=0; j<jmax; j++) {
1651       rmap_i[irow_i[j]] = j;
1652     }
1653   }
1654 #endif
1655 
1656   /* Update lens from offproc data */
1657   {
1658     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1659 
1660     for (tmp2=0; tmp2<nrqs; tmp2++) {
1661       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1662       idex    = pa[idex2];
1663       sbuf1_i = sbuf1[idex];
1664       jmax    = sbuf1_i[0];
1665       ct1     = 2*jmax+1;
1666       ct2     = 0;
1667       rbuf2_i = rbuf2[idex2];
1668       rbuf3_i = rbuf3[idex2];
1669       for (j=1; j<=jmax; j++) {
1670         is_no  = sbuf1_i[2*j-1];
1671         max1   = sbuf1_i[2*j];
1672         lens_i = lens[is_no];
1673         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1674         rmap_i = rmap[is_no];
1675         for (k=0; k<max1; k++,ct1++) {
1676 #if defined(PETSC_USE_CTABLE)
1677           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1678           row--;
1679           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1680 #else
1681           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1682 #endif
1683           max2 = rbuf2_i[ct1];
1684           for (l=0; l<max2; l++,ct2++) {
1685             if (!allcolumns[is_no]) {
1686 #if defined(PETSC_USE_CTABLE)
1687               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1688 #else
1689               tcol = cmap_i[rbuf3_i[ct2]];
1690 #endif
1691               if (tcol) lens_i[row]++;
1692             } else { /* allcolumns */
1693               lens_i[row]++; /* lens_i[row] += max2 ? */
1694             }
1695           }
1696         }
1697       }
1698     }
1699   }
1700   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1701   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1702   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1703   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1704   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1705 
1706   /* Create the submatrices */
1707   if (scall == MAT_REUSE_MATRIX) {
1708     PetscBool flag;
1709 
1710     /*
1711         Assumes new rows are same length as the old rows,hence bug!
1712     */
1713     for (i=0; i<ismax; i++) {
1714       mat = (Mat_SeqAIJ*)(submats[i]->data);
1715       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1716       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1717       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1718       /* Initial matrix as if empty */
1719       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1720 
1721       submats[i]->factortype = C->factortype;
1722     }
1723   } else {
1724     for (i=0; i<ismax; i++) {
1725       PetscInt rbs,cbs;
1726       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1727       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1728 
1729       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1730       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1731 
1732       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1733       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1734       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1735     }
1736   }
1737 
1738   /* Assemble the matrices */
1739   /* First assemble the local rows */
1740   {
1741     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1742     PetscScalar *imat_a;
1743 
1744     for (i=0; i<ismax; i++) {
1745       mat       = (Mat_SeqAIJ*)submats[i]->data;
1746       imat_ilen = mat->ilen;
1747       imat_j    = mat->j;
1748       imat_i    = mat->i;
1749       imat_a    = mat->a;
1750 
1751       if (!allcolumns[i]) cmap_i = cmap[i];
1752       rmap_i = rmap[i];
1753       irow_i = irow[i];
1754       jmax   = nrow[i];
1755       for (j=0; j<jmax; j++) {
1756         l   = 0;
1757         row = irow_i[j];
1758         while (row >= C->rmap->range[l+1]) l++;
1759         proc = l;
1760         if (proc == rank) {
1761           old_row = row;
1762 #if defined(PETSC_USE_CTABLE)
1763           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1764           row--;
1765 #else
1766           row = rmap_i[row];
1767 #endif
1768           ilen_row = imat_ilen[row];
1769           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1770           mat_i    = imat_i[row];
1771           mat_a    = imat_a + mat_i;
1772           mat_j    = imat_j + mat_i;
1773           if (!allcolumns[i]) {
1774             for (k=0; k<ncols; k++) {
1775 #if defined(PETSC_USE_CTABLE)
1776               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1777 #else
1778               tcol = cmap_i[cols[k]];
1779 #endif
1780               if (tcol) {
1781                 *mat_j++ = tcol - 1;
1782                 *mat_a++ = vals[k];
1783                 ilen_row++;
1784               }
1785             }
1786           } else { /* allcolumns */
1787             for (k=0; k<ncols; k++) {
1788               *mat_j++ = cols[k];  /* global col index! */
1789               *mat_a++ = vals[k];
1790               ilen_row++;
1791             }
1792           }
1793           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1794 
1795           imat_ilen[row] = ilen_row;
1796         }
1797       }
1798     }
1799   }
1800 
1801   /*   Now assemble the off proc rows*/
1802   {
1803     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1804     PetscInt    *imat_j,*imat_i;
1805     PetscScalar *imat_a,*rbuf4_i;
1806 
1807     for (tmp2=0; tmp2<nrqs; tmp2++) {
1808       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1809       idex    = pa[idex2];
1810       sbuf1_i = sbuf1[idex];
1811       jmax    = sbuf1_i[0];
1812       ct1     = 2*jmax + 1;
1813       ct2     = 0;
1814       rbuf2_i = rbuf2[idex2];
1815       rbuf3_i = rbuf3[idex2];
1816       rbuf4_i = rbuf4[idex2];
1817       for (j=1; j<=jmax; j++) {
1818         is_no     = sbuf1_i[2*j-1];
1819         rmap_i    = rmap[is_no];
1820         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1821         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1822         imat_ilen = mat->ilen;
1823         imat_j    = mat->j;
1824         imat_i    = mat->i;
1825         imat_a    = mat->a;
1826         max1      = sbuf1_i[2*j];
1827         for (k=0; k<max1; k++,ct1++) {
1828           row = sbuf1_i[ct1];
1829 #if defined(PETSC_USE_CTABLE)
1830           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1831           row--;
1832 #else
1833           row = rmap_i[row];
1834 #endif
1835           ilen  = imat_ilen[row];
1836           mat_i = imat_i[row];
1837           mat_a = imat_a + mat_i;
1838           mat_j = imat_j + mat_i;
1839           max2  = rbuf2_i[ct1];
1840           if (!allcolumns[is_no]) {
1841             for (l=0; l<max2; l++,ct2++) {
1842 
1843 #if defined(PETSC_USE_CTABLE)
1844               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1845 #else
1846               tcol = cmap_i[rbuf3_i[ct2]];
1847 #endif
1848               if (tcol) {
1849                 *mat_j++ = tcol - 1;
1850                 *mat_a++ = rbuf4_i[ct2];
1851                 ilen++;
1852               }
1853             }
1854           } else { /* allcolumns */
1855             for (l=0; l<max2; l++,ct2++) {
1856               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1857               *mat_a++ = rbuf4_i[ct2];
1858               ilen++;
1859             }
1860           }
1861           imat_ilen[row] = ilen;
1862         }
1863       }
1864     }
1865   }
1866 
1867   /* sort the rows */
1868   {
1869     PetscInt    *imat_ilen,*imat_j,*imat_i;
1870     PetscScalar *imat_a;
1871 
1872     for (i=0; i<ismax; i++) {
1873       mat       = (Mat_SeqAIJ*)submats[i]->data;
1874       imat_j    = mat->j;
1875       imat_i    = mat->i;
1876       imat_a    = mat->a;
1877       imat_ilen = mat->ilen;
1878 
1879       if (allcolumns[i]) continue;
1880       jmax = nrow[i];
1881       for (j=0; j<jmax; j++) {
1882         PetscInt ilen;
1883 
1884         mat_i = imat_i[j];
1885         mat_a = imat_a + mat_i;
1886         mat_j = imat_j + mat_i;
1887         ilen  = imat_ilen[j];
1888         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1889       }
1890     }
1891   }
1892 
1893   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1894   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1895   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1896   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1897   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1898 
1899   /* Restore the indices */
1900   for (i=0; i<ismax; i++) {
1901     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1902     if (!allcolumns[i]) {
1903       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1904     }
1905   }
1906 
1907   /* Destroy allocated memory */
1908   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1909   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1910   ierr = PetscFree(pa);CHKERRQ(ierr);
1911 
1912   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1913   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1914   for (i=0; i<nrqr; ++i) {
1915     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1916   }
1917   for (i=0; i<nrqs; ++i) {
1918     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1919     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1920   }
1921 
1922   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1923   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1924   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1925   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1926   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1927   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1928   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1929 
1930 #if defined(PETSC_USE_CTABLE)
1931   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1932 #else
1933   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1934 #endif
1935   ierr = PetscFree(rmap);CHKERRQ(ierr);
1936 
1937   for (i=0; i<ismax; i++) {
1938     if (!allcolumns[i]) {
1939 #if defined(PETSC_USE_CTABLE)
1940       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1941 #else
1942       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1943 #endif
1944     }
1945   }
1946   ierr = PetscFree(cmap);CHKERRQ(ierr);
1947   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1948   ierr = PetscFree(lens);CHKERRQ(ierr);
1949 
1950   for (i=0; i<ismax; i++) {
1951     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1952     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1953   }
1954   PetscFunctionReturn(0);
1955 }
1956 
1957 /*
1958  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1959  Be careful not to destroy them elsewhere.
1960  */
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix C on comm directly from its two sequential pieces:
   A (the diagonal block) and B (the off-diagonal block, in compressed column form).
   The pieces are adopted by reference (no incref) -- the caller must not destroy them.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* PetscAbs() because an unset block size may be negative (e.g. -1) -- compare magnitudes only. */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Local sizes come from the diagonal block; global sizes are derived by the layout setup. */
  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* B holds only the nonlocal columns, so diag cols + offdiag cols must equal the global column count. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Adopt the sequential pieces directly; see the note above this function about reference counts. */
  aij    = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C inherits its state from the pieces: it is preallocated/assembled only if both pieces are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}
1995 
1996 #undef __FUNCT__
1997 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1998 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1999 {
2000   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2001 
2002   PetscFunctionBegin;
2003   PetscValidPointer(A,2);
2004   PetscValidPointer(B,3);
2005   *A = aij->A;
2006   *B = aij->B;
2007   /* Note that we don't incref *A and *B, so be careful! */
2008   PetscFunctionReturn(0);
2009 }
2010 
2011 #undef __FUNCT__
2012 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
2013 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2014                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
2015                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
2016                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
2017 {
2018   PetscErrorCode ierr;
2019   PetscMPIInt    size, flag;
2020   PetscInt       i,ii;
2021   PetscInt       ismax_c;
2022 
2023   PetscFunctionBegin;
2024   if (!ismax) PetscFunctionReturn(0);
2025 
2026   for (i = 0, ismax_c = 0; i < ismax; ++i) {
2027     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
2028     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2029     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2030     if (size > 1) ++ismax_c;
2031   }
2032   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
2033     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2034   } else { /* if (ismax_c) */
2035     Mat         *A,*B;
2036     IS          *isrow_c, *iscol_c;
2037     PetscMPIInt size;
2038     /*
2039      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2040      array of sequential matrices underlying the resulting parallel matrices.
2041      Which arrays to allocate is based on the value of MatReuse scall.
2042      There are as many diag matrices as there are original index sets.
2043      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
2044 
2045      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
2046      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
2047      will deposite the extracted diag and off-diag parts.
2048      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
2049      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
2050     */
2051 
2052     /* Parallel matrix array is allocated only if no reuse is taking place. */
2053     if (scall != MAT_REUSE_MATRIX) {
2054       ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2055     } else {
2056       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
2057       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
2058       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
2059       for (i = 0, ii = 0; i < ismax; ++i) {
2060         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2061         if (size > 1) {
2062           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
2063           ++ii;
2064         } else A[i] = (*submat)[i];
2065       }
2066     }
2067     /*
2068      Construct the complements of the iscol ISs for parallel ISs only.
2069      These are used to extract the off-diag portion of the resulting parallel matrix.
2070      The row IS for the off-diag portion is the same as for the diag portion,
2071      so we merely alias the row IS, while skipping those that are sequential.
2072     */
2073     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
2074     for (i = 0, ii = 0; i < ismax; ++i) {
2075       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2076       if (size > 1) {
2077         isrow_c[ii] = isrow[i];
2078 
2079         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
2080         ++ii;
2081       }
2082     }
2083     /* Now obtain the sequential A and B submatrices separately. */
2084     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
2085     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
2086     for (ii = 0; ii < ismax_c; ++ii) {
2087       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
2088     }
2089     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
2090     /*
2091      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
2092      have been extracted directly into the parallel matrices containing them, or
2093      simply into the sequential matrix identical with the corresponding A (if size == 1).
2094      Otherwise, make sure that parallel matrices are constructed from A & B, or the
2095      A is put into the correct submat slot (if size == 1).
2096      */
2097     if (scall != MAT_REUSE_MATRIX) {
2098       for (i = 0, ii = 0; i < ismax; ++i) {
2099         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2100         if (size > 1) {
2101           /*
2102            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
2103            */
2104           /* Construct submat[i] from the Seq pieces A and B. */
2105           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
2106 
2107           ++ii;
2108         } else (*submat)[i] = A[i];
2109       }
2110     }
2111     ierr = PetscFree(A);CHKERRQ(ierr);
2112     ierr = PetscFree(B);CHKERRQ(ierr);
2113   }
2114   PetscFunctionReturn(0);
2115 } /* MatGetSubMatricesParallel_MPIXAIJ() */
2116 
2117 
2118 
2119 #undef __FUNCT__
2120 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
2121 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2122 {
2123   PetscErrorCode ierr;
2124 
2125   PetscFunctionBegin;
2126   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
2127   PetscFunctionReturn(0);
2128 }
2129