xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision f1c23e0e28375b6cfb403ac12250cd2b606dd9bd)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscBool      mat_increase_overlap_k,set;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   ierr = PetscOptionsGetBool(NULL,"-mat_increase_overlap_k",&mat_increase_overlap_k,&set);CHKERRQ(ierr);
33   if(!set) mat_increase_overlap_k = PETSC_FALSE;
34   for (i=0; i<ov; ++i) {
35     if(mat_increase_overlap_k){
36       ierr = MatIncreaseOverlap_MPIAIJ_Once_K(C,imax,is);CHKERRQ(ierr);
37     }else{
38       ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
39     }
40   }
41   PetscFunctionReturn(0);
42 }
43 
44 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_K"
/*
   MatIncreaseOverlap_MPIAIJ_Once_K - One round of overlap increase using a
   PetscSF (star forest) for communication.

   Outline:
     1. For every IS entry, find the owning process via the row layout.
        Entries owned elsewhere are recorded as "remote rows" (row, owner,
        and which IS they came from).
     2. If no process has any remote rows (reducednrrows == 0), only the
        local expansion (..._Local_K) is needed.
     3. Otherwise, send <is id, row> pairs to the owners (sizes exchanged
        with PetscCommBuildTwoSided, payload with PetscSFBcast), let the
        owners expand the rows into column indices (..._Send_K), ship the
        replies back over a second SF, and merge them into the index sets
        (..._Receive_K).  The local expansion is overlapped with the second
        communication round.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /*communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /*row map*/
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /*retrieve IS data*/
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /*get length and indices, and the total number of entries over all sets*/
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /*find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
	length_i     = length[i];
	indices_i    = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /*remote processors: record row, its owner, and its IS of origin*/
      if(owner != rank){
    	tosizes_temp[owner]++; /*number of rows to owner*/
    	rrow_ranks[nrrows]  = owner; /*processor */
    	rrow_isids[nrrows]   = i; /*is id*/
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /*test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /*we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if(!reducednrrows){
	ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
	ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
	/* no process has off-process rows: local expansion is all that is needed */
	ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
	PetscFunctionReturn(0);
  }
  nto = 0;
  /*compact the per-processor counts into <size,offset> pairs plus a rank list*/
  for(i=0; i<size; i++){
   if(tosizes_temp[i]){
    tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two PetscInts (is id, row) per remote row */
    tosizes_temp[i] = nto; /* a map from processor to index */
	toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
	toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /*offsets*/
	tosizes[2*i+1] = toffsets[i]; /*offsets to send*/
  }
  /*tell each destination how much it will receive from us, and learn the reverse*/
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /*build a star forest whose leaves are the slots we receive into */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
	for(j=0; j<fromsizes[2*i]; j++){
	  remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
	}
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* the SF takes ownership of "remote" (PETSC_OWN_POINTER) */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /*message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /*pack the outgoing pairs at each destination's running offset*/
  for(i=0; i<nrrows; i++){
	owner = rrow_ranks[i]; /* processor */
	j     = tosizes_temp[owner]; /* index */
	todata[toffsets[j]++] = rrow_isids[i];
	todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /*deal with remote data: expand the requested rows into column indices*/
  ierr = MatIncreaseOverlap_MPIAIJ_Send_K(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /*second round: reply sizes flow in the opposite direction*/
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
  	for(j=0; j<tosizes[2*i]; j++){
  	  remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
  	}
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*overlap communication and computation: do the local expansion while replies travel*/
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /*merge the received column indices into the index sets*/
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_K(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
192 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_K"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_K - Merges column indices received from
   other processes into the index sets.

   recvdata is a sequence of triples <is id, m, m column indices> with total
   length nrecvs.  Each is[i] is destroyed and recreated as a sorted,
   duplicate-free IS containing its previous entries plus the received ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* current size of each IS, and the largest one (used to size the workspace) */
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
   ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
   max_lsize = lsize>max_lsize ? lsize:max_lsize;
   isz[i]    = lsize;
  }
  /* one slab of (max_lsize+nrecvs) entries per IS: old entries can be at most
     max_lsize and at most nrecvs new entries can arrive in total */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
	ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
	ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
	ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /*retrieve information: walk the <is id, count, cols...> triples */
  for(i=0; i<nrecvs; ){
	is_id      = recvdata[i++];
	data_size  = recvdata[i++];
	indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
	isz_i      = isz[is_id];
    for(j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /*remove duplicate entities and rebuild each IS*/
  for(i=0; i<nidx; i++){
   indices_i  = indices_temp+(max_lsize+nrecvs)*i;
   isz_i      = isz[i];
   ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_K"
/*
   MatIncreaseOverlap_MPIAIJ_Send_K - Owner-side expansion: for each process
   that requested rows from us, expand those rows into their global column
   indices and pack the replies.

   Input: fromrows holds, for each requesting process i, fromsizes[2*i]
   entries organized as <is id, row> pairs (fromsizes[2*i+1] is that
   process's offset, produced by the caller).

   Output (allocated here, returned to the caller):
     sbrowsizes - per process i: total reply length in [2*i], its offset
                  into sbrows in [2*i+1]
     sbrows     - concatenated triples <is id, count, count column indices>

   Two passes over the same request data: the first counts the deduplicated
   reply sizes, the second fills the reply buffer.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /* amat/bmat: diagonal and off-diagonal sequential blocks; gcols maps bmat's
     local column numbers to global ones */
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric*/
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /*total number of nonzero values, used as the workspace size below */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /*longest message */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /*better way to estimate number of nonzero in the mat???*/
  /* NOTE(review): indices_tmp is sized by tnz, the local nonzero count; this
     assumes the rows requested within one <rank,is> group do not repeat often
     enough to exceed it - TODO confirm */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count reply sizes */
  for(i=0; i<nfrom; i++){
	ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
	/*group data: bucket the requested rows by their IS id*/
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/*no of is*/
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /*estimate a space to avoid multiple allocations  */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col;/*do not count the rows from the original rank*/
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      /* deduplicated size of the reply for <process i, is j> */
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /*message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal again, now filling sbdata */
  for(i=0;i<nfrom;i++){
  	ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets: prefix sums of the per-process reply sizes */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
	offsets[i+1]   = offsets[i] + sbsizes[2*i];
	sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_K"
372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat mat,PetscInt nidx, IS is[])
373 {
374   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
375   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
376   PetscInt          lsize,lsize_tmp,owner;
377   PetscMPIInt       rank;
378   Mat   	        amat,bmat;
379   PetscBool         done;
380   PetscLayout       cmap,rmap;
381   MPI_Comm          comm;
382   PetscErrorCode    ierr;
383 
384   PetscFunctionBegin;
385   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
386   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
387   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
388   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
389   if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
390   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
391   if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
392   /*is it a safe way to compute number of nonzero values ?*/
393   tnz  = ai[an]+bi[bn];
394   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
395   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
397   /*it is a better way to estimate memory than the old implementation
398    * where global size of matrix is used
399    * */
400   ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
401   for(i=0; i<nidx; i++) {
402 	ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
403 	ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
404 	lsize_tmp = 0;
405     for(j=0; j<lsize; j++) {
406        owner = -1;
407        row   = indices[j];
408        ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
409        if(owner != rank) continue;
410        /*local number*/
411        row  -= rstart;
412        start = ai[row];
413        end   = ai[row+1];
414        for(k=start; k<end; k++) { /* Amat */
415          col = aj[k] + cstart;
416          indices_temp[lsize_tmp++] = col;
417        }
418        start = bi[row];
419        end   = bi[row+1];
420        for(k=start; k<end; k++) { /* Bmat */
421          col = gcols[bj[k]];
422          indices_temp[lsize_tmp++] = col;
423        }
424     }
425    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
426    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
427    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
428    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
429   }
430   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
431   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 
437 /*
438   Sample message format:
439   If a processor A wants processor B to process some elements corresponding
440   to index sets is[1],is[5]
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5]
450   mesg [n]  datas[1]
451   -----------
452   mesg[n+1]
453   mesg[m]  data(is[5])
454   -----------
455 
456   Notes:
457   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
459 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - One round of overlap increase using the
   original two-phase nonblocking point-to-point protocol (see the message
   format described in the comment above).

   Phase 1: each process sends the off-process rows of its index sets to
   their owners and, in parallel, expands the locally owned rows
   (..._Local).  Phase 2: the owners expand the requested rows
   (..._Receive) and send the resulting column indices back; the replies
   are merged into the per-set bit tables and data arrays, from which the
   enlarged index sets are rebuilt.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* separate tags so phase-1 and phase-2 messages cannot be confused */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total entries to send to p; w2[p]: 1 iff we message p;
     w3[p]: number of index sets contributing to p; w4: per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one outgoing message per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers: outdat[j][0] = #IS, then <is no, count> pairs*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: per-IS bit table (t_p) and entry array (d_p)*/
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the phase-1 messages are in flight*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received requests into replies (xdata[i] of length isz1[i]) */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors: merge replies as they arrive*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header <#IS, (is no, count)...> then the data*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
735 /*
736    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
737        the work on the local processor.
738 
739      Inputs:
740       C      - MAT_MPIAIJ;
741       imax - total no of index sets processed at a time;
742       table  - an array of char - size = m bits.
743 
744      Output:
745       isz    - array containing the count of the solution elements corresponding
746                to each index set;
747       data   - pointer to the solutions
748 */
749 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
750 {
751   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
752   Mat        A  = c->A,B = c->B;
753   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
754   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
755   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
756   PetscBT    table_i;
757 
758   PetscFunctionBegin;
759   rstart = C->rmap->rstart;
760   cstart = C->cmap->rstart;
761   ai     = a->i;
762   aj     = a->j;
763   bi     = b->i;
764   bj     = b->j;
765   garray = c->garray;
766 
767 
768   for (i=0; i<imax; i++) {
769     data_i  = data[i];
770     table_i = table[i];
771     isz_i   = isz[i];
772     for (j=0,max=isz[i]; j<max; j++) {
773       row   = data_i[j] - rstart;
774       start = ai[row];
775       end   = ai[row+1];
776       for (k=start; k<end; k++) { /* Amat */
777         val = aj[k] + cstart;
778         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
779       }
780       start = bi[row];
781       end   = bi[row+1];
782       for (k=start; k<end; k++) { /* Bmat */
783         val = garray[bj[k]];
784         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
785       }
786     }
787     isz[i] = isz_i;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 #undef __FUNCT__
793 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
794 /*
795       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
796          and return the output
797 
798          Input:
799            C    - the matrix
800            nrqr - no of messages being processed.
801            rbuf - an array of pointers to the recieved requests
802 
803          Output:
804            xdata - array of messages to be sent back
805            isz1  - size of each message
806 
807   For better efficiency perhaps we should malloc separately each xdata[i],
808 then if a remalloc is required we need only copy the data for that one row
809 rather then all previous rows as it is now where a single large chunck of
810 memory is used.
811 
812 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;       /* global row count: the bit table must span all rows */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;        /* local off-diagonal column index -> global column */


  /* First pass over the requests: ct = total rows requested across all messages,
     total_sz = total number of row indices received (each header entry rbuf_i[2*j]
     is the length of the j-th index set in message i) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the reply-buffer size from the average nonzeros per local row;
     all replies share one contiguous allocation anchored at xdata[0] */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts total entries written into the shared buffer; when it reaches
     mem_estimate the buffer is grown by 1.5x and the xdata[] pointers, which
     all alias into the single allocation, are re-based from the new base */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: skip the header of message i */
    ct2    =  ct1;          /* write cursor into xdata[i] (header space reserved) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh duplicate-filter for every index set */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* remember where this IS's output starts */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer and re-base all previously built replies */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Expand each accepted row by its neighbors in both the diagonal (A)
         and off-diagonal (B) blocks; appended values extend [oct2,ct2) but
         only the rows accepted above (max2 fixed here) are expanded */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* A block: shift local column to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-re-base as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* B block: map through garray to global */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-re-base as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: entry count for this IS and the IS id echoed back */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;  /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
929 /* -------------------------------------------------------------------------*/
930 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
931 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
932 /*
933     Every processor gets the entire matrix
934 */
935 #undef __FUNCT__
936 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* each rank fills its own slice of lens; Allgatherv completes the rest */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* pack this rank's rows directly into B's j array at the slot for rstarts[rank] */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are now per-rank nonzero counts, not row counts */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: the structure was built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values land next to their matching indices */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* in the REUSE path recvcounts/displs were never allocated */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1119 
1120 
1121 
1122 #undef __FUNCT__
1123 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1124 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1125 {
1126   PetscErrorCode ierr;
1127   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1128   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1129 
1130   PetscFunctionBegin;
1131 
1132   /*
1133        Check for special case: each processor gets entire matrix
1134   */
1135   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1136     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1137     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1138     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1139     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1140     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1141       wantallmatrix = PETSC_TRUE;
1142 
1143       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1144     }
1145   }
1146   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1147   if (twantallmatrix) {
1148     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1149     PetscFunctionReturn(0);
1150   }
1151 
1152   /* Allocate memory to hold all the submatrices */
1153   if (scall != MAT_REUSE_MATRIX) {
1154     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1155   }
1156 
1157   /* Check for special case: each processor gets entire matrix columns */
1158   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1159   for (i=0; i<ismax; i++) {
1160     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1161     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1162     if (colflag && ncol == C->cmap->N) {
1163       allcolumns[i] = PETSC_TRUE;
1164     } else {
1165       allcolumns[i] = PETSC_FALSE;
1166     }
1167   }
1168 
1169   /* Determine the number of stages through which submatrices are done */
1170   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1171 
1172   /*
1173      Each stage will extract nmax submatrices.
1174      nmax is determined by the matrix column dimension.
1175      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1176   */
1177   if (!nmax) nmax = 1;
1178   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1179 
1180   /* Make sure every processor loops through the nstages */
1181   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1182 
1183   for (i=0,pos=0; i<nstages; i++) {
1184     if (pos+nmax <= ismax) max_no = nmax;
1185     else if (pos == ismax) max_no = 0;
1186     else                   max_no = ismax-pos;
1187     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1188     pos += max_no;
1189   }
1190 
1191   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 /* -------------------------------------------------------------------------*/
1196 #undef __FUNCT__
1197 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1198 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1199 {
1200   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1201   Mat            A  = c->A;
1202   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1203   const PetscInt **icol,**irow;
1204   PetscInt       *nrow,*ncol,start;
1205   PetscErrorCode ierr;
1206   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1207   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1208   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1209   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1210   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1211 #if defined(PETSC_USE_CTABLE)
1212   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1213 #else
1214   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1215 #endif
1216   const PetscInt *irow_i;
1217   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1218   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1219   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1220   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1221   MPI_Status     *r_status3,*r_status4,*s_status4;
1222   MPI_Comm       comm;
1223   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1224   PetscMPIInt    *onodes1,*olengths1;
1225   PetscMPIInt    idex,idex2,end;
1226 
1227   PetscFunctionBegin;
1228   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1229   tag0 = ((PetscObject)C)->tag;
1230   size = c->size;
1231   rank = c->rank;
1232 
1233   /* Get some new tags to keep the communication clean */
1234   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1235   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1236   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1237 
1238   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1239 
1240   for (i=0; i<ismax; i++) {
1241     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1242     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1243     if (allcolumns[i]) {
1244       icol[i] = NULL;
1245       ncol[i] = C->cmap->N;
1246     } else {
1247       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1248       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1249     }
1250   }
1251 
1252   /* evaluate communication - mesg to who, length of mesg, and buffer space
1253      required. Based on this, buffers are allocated, and data copied into them*/
1254   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1255   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1256   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1257   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1258   for (i=0; i<ismax; i++) {
1259     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1260     jmax   = nrow[i];
1261     irow_i = irow[i];
1262     for (j=0; j<jmax; j++) {
1263       l   = 0;
1264       row = irow_i[j];
1265       while (row >= C->rmap->range[l+1]) l++;
1266       proc = l;
1267       w4[proc]++;
1268     }
1269     for (j=0; j<size; j++) {
1270       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1271     }
1272   }
1273 
1274   nrqs     = 0;              /* no of outgoing messages */
1275   msz      = 0;              /* total mesg length (for all procs) */
1276   w1[rank] = 0;              /* no mesg sent to self */
1277   w3[rank] = 0;
1278   for (i=0; i<size; i++) {
1279     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1280   }
1281   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1282   for (i=0,j=0; i<size; i++) {
1283     if (w1[i]) { pa[j] = i; j++; }
1284   }
1285 
1286   /* Each message would have a header = 1 + 2*(no of IS) + data */
1287   for (i=0; i<nrqs; i++) {
1288     j      = pa[i];
1289     w1[j] += w2[j] + 2* w3[j];
1290     msz   += w1[j];
1291   }
1292   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1293 
1294   /* Determine the number of messages to expect, their lengths, from from-ids */
1295   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1296   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1297 
1298   /* Now post the Irecvs corresponding to these messages */
1299   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1300 
1301   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1302   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1303 
1304   /* Allocate Memory for outgoing messages */
1305   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1306   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1307   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1308 
1309   {
1310     PetscInt *iptr = tmp,ict = 0;
1311     for (i=0; i<nrqs; i++) {
1312       j        = pa[i];
1313       iptr    += ict;
1314       sbuf1[j] = iptr;
1315       ict      = w1[j];
1316     }
1317   }
1318 
1319   /* Form the outgoing messages */
1320   /* Initialize the header space */
1321   for (i=0; i<nrqs; i++) {
1322     j           = pa[i];
1323     sbuf1[j][0] = 0;
1324     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1325     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1326   }
1327 
1328   /* Parse the isrow and copy data into outbuf */
1329   for (i=0; i<ismax; i++) {
1330     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1331     irow_i = irow[i];
1332     jmax   = nrow[i];
1333     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1334       l   = 0;
1335       row = irow_i[j];
1336       while (row >= C->rmap->range[l+1]) l++;
1337       proc = l;
1338       if (proc != rank) { /* copy to the outgoing buf*/
1339         ctr[proc]++;
1340         *ptr[proc] = row;
1341         ptr[proc]++;
1342       }
1343     }
1344     /* Update the headers for the current IS */
1345     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1346       if ((ctr_j = ctr[j])) {
1347         sbuf1_j        = sbuf1[j];
1348         k              = ++sbuf1_j[0];
1349         sbuf1_j[2*k]   = ctr_j;
1350         sbuf1_j[2*k-1] = i;
1351       }
1352     }
1353   }
1354 
1355   /*  Now  post the sends */
1356   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1357   for (i=0; i<nrqs; ++i) {
1358     j    = pa[i];
1359     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1360   }
1361 
1362   /* Post Receives to capture the buffer size */
1363   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1364   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1365   rbuf2[0] = tmp + msz;
1366   for (i=1; i<nrqs; ++i) {
1367     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1368   }
1369   for (i=0; i<nrqs; ++i) {
1370     j    = pa[i];
1371     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1372   }
1373 
1374   /* Send to other procs the buf size they should allocate */
1375 
1376 
1377   /* Receive messages*/
1378   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1379   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1380   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1381   {
1382     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1383     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1384     PetscInt   *sbuf2_i;
1385 
1386     for (i=0; i<nrqr; ++i) {
1387       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1388 
1389       req_size[idex] = 0;
1390       rbuf1_i        = rbuf1[idex];
1391       start          = 2*rbuf1_i[0] + 1;
1392       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1393       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1394       sbuf2_i        = sbuf2[idex];
1395       for (j=start; j<end; j++) {
1396         id              = rbuf1_i[j] - rstart;
1397         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1398         sbuf2_i[j]      = ncols;
1399         req_size[idex] += ncols;
1400       }
1401       req_source[idex] = r_status1[i].MPI_SOURCE;
1402       /* form the header */
1403       sbuf2_i[0] = req_size[idex];
1404       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1405 
1406       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1407     }
1408   }
1409   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1410   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1411 
1412   /*  recv buffer sizes */
1413   /* Receive messages*/
1414 
1415   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1416   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1417   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1418   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1419   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1420 
1421   for (i=0; i<nrqs; ++i) {
1422     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1423     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1424     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1425     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1426     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1427   }
1428   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1429   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1430 
1431   /* Wait on sends1 and sends2 */
1432   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1433   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1434 
1435   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1436   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1437   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1438   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1439   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1440   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1441 
1442   /* Now allocate buffers for a->j, and send them off */
1443   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1444   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1445   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1446   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1447 
1448   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1449   {
1450     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1451     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1452     PetscInt cend = C->cmap->rend;
1453     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1454 
1455     for (i=0; i<nrqr; i++) {
1456       rbuf1_i   = rbuf1[i];
1457       sbuf_aj_i = sbuf_aj[i];
1458       ct1       = 2*rbuf1_i[0] + 1;
1459       ct2       = 0;
1460       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1461         kmax = rbuf1[i][2*j];
1462         for (k=0; k<kmax; k++,ct1++) {
1463           row    = rbuf1_i[ct1] - rstart;
1464           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1465           ncols  = nzA + nzB;
1466           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1467 
1468           /* load the column indices for this row into cols*/
1469           cols = sbuf_aj_i + ct2;
1470 
1471           lwrite = 0;
1472           for (l=0; l<nzB; l++) {
1473             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1474           }
1475           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1476           for (l=0; l<nzB; l++) {
1477             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1478           }
1479 
1480           ct2 += ncols;
1481         }
1482       }
1483       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1484     }
1485   }
1486   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1487   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1488 
1489   /* Allocate buffers for a->a, and send them off */
1490   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1491   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1492   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1493   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1494 
1495   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1496   {
1497     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1498     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1499     PetscInt    cend   = C->cmap->rend;
1500     PetscInt    *b_j   = b->j;
1501     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1502 
1503     for (i=0; i<nrqr; i++) {
1504       rbuf1_i   = rbuf1[i];
1505       sbuf_aa_i = sbuf_aa[i];
1506       ct1       = 2*rbuf1_i[0]+1;
1507       ct2       = 0;
1508       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1509         kmax = rbuf1_i[2*j];
1510         for (k=0; k<kmax; k++,ct1++) {
1511           row    = rbuf1_i[ct1] - rstart;
1512           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1513           ncols  = nzA + nzB;
1514           cworkB = b_j + b_i[row];
1515           vworkA = a_a + a_i[row];
1516           vworkB = b_a + b_i[row];
1517 
1518           /* load the column values for this row into vals*/
1519           vals = sbuf_aa_i+ct2;
1520 
1521           lwrite = 0;
1522           for (l=0; l<nzB; l++) {
1523             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1524           }
1525           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1526           for (l=0; l<nzB; l++) {
1527             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1528           }
1529 
1530           ct2 += ncols;
1531         }
1532       }
1533       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1534     }
1535   }
1536   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1537   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1538   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1539   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1540 
1541   /* Form the matrix */
1542   /* create col map: global col of C -> local col of submatrices */
1543   {
1544     const PetscInt *icol_i;
1545 #if defined(PETSC_USE_CTABLE)
1546     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1547     for (i=0; i<ismax; i++) {
1548       if (!allcolumns[i]) {
1549         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1550 
1551         jmax   = ncol[i];
1552         icol_i = icol[i];
1553         cmap_i = cmap[i];
1554         for (j=0; j<jmax; j++) {
1555           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1556         }
1557       } else {
1558         cmap[i] = NULL;
1559       }
1560     }
1561 #else
1562     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1563     for (i=0; i<ismax; i++) {
1564       if (!allcolumns[i]) {
1565         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1566         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1567         jmax   = ncol[i];
1568         icol_i = icol[i];
1569         cmap_i = cmap[i];
1570         for (j=0; j<jmax; j++) {
1571           cmap_i[icol_i[j]] = j+1;
1572         }
1573       } else {
1574         cmap[i] = NULL;
1575       }
1576     }
1577 #endif
1578   }
1579 
1580   /* Create lens which is required for MatCreate... */
1581   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1582   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1583   if (ismax) {
1584     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1585     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1586   }
1587   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1588 
1589   /* Update lens from local data */
1590   for (i=0; i<ismax; i++) {
1591     jmax = nrow[i];
1592     if (!allcolumns[i]) cmap_i = cmap[i];
1593     irow_i = irow[i];
1594     lens_i = lens[i];
1595     for (j=0; j<jmax; j++) {
1596       l   = 0;
1597       row = irow_i[j];
1598       while (row >= C->rmap->range[l+1]) l++;
1599       proc = l;
1600       if (proc == rank) {
1601         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1602         if (!allcolumns[i]) {
1603           for (k=0; k<ncols; k++) {
1604 #if defined(PETSC_USE_CTABLE)
1605             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1606 #else
1607             tcol = cmap_i[cols[k]];
1608 #endif
1609             if (tcol) lens_i[j]++;
1610           }
1611         } else { /* allcolumns */
1612           lens_i[j] = ncols;
1613         }
1614         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1615       }
1616     }
1617   }
1618 
1619   /* Create row map: global row of C -> local row of submatrices */
1620 #if defined(PETSC_USE_CTABLE)
1621   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1622   for (i=0; i<ismax; i++) {
1623     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1624     rmap_i = rmap[i];
1625     irow_i = irow[i];
1626     jmax   = nrow[i];
1627     for (j=0; j<jmax; j++) {
1628       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1629     }
1630   }
1631 #else
1632   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1633   if (ismax) {
1634     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1635     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1636   }
1637   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1638   for (i=0; i<ismax; i++) {
1639     rmap_i = rmap[i];
1640     irow_i = irow[i];
1641     jmax   = nrow[i];
1642     for (j=0; j<jmax; j++) {
1643       rmap_i[irow_i[j]] = j;
1644     }
1645   }
1646 #endif
1647 
1648   /* Update lens from offproc data */
1649   {
1650     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1651 
1652     for (tmp2=0; tmp2<nrqs; tmp2++) {
1653       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1654       idex    = pa[idex2];
1655       sbuf1_i = sbuf1[idex];
1656       jmax    = sbuf1_i[0];
1657       ct1     = 2*jmax+1;
1658       ct2     = 0;
1659       rbuf2_i = rbuf2[idex2];
1660       rbuf3_i = rbuf3[idex2];
1661       for (j=1; j<=jmax; j++) {
1662         is_no  = sbuf1_i[2*j-1];
1663         max1   = sbuf1_i[2*j];
1664         lens_i = lens[is_no];
1665         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1666         rmap_i = rmap[is_no];
1667         for (k=0; k<max1; k++,ct1++) {
1668 #if defined(PETSC_USE_CTABLE)
1669           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1670           row--;
1671           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1672 #else
1673           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1674 #endif
1675           max2 = rbuf2_i[ct1];
1676           for (l=0; l<max2; l++,ct2++) {
1677             if (!allcolumns[is_no]) {
1678 #if defined(PETSC_USE_CTABLE)
1679               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1680 #else
1681               tcol = cmap_i[rbuf3_i[ct2]];
1682 #endif
1683               if (tcol) lens_i[row]++;
1684             } else { /* allcolumns */
1685               lens_i[row]++; /* lens_i[row] += max2 ? */
1686             }
1687           }
1688         }
1689       }
1690     }
1691   }
1692   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1693   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1694   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1695   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1696   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1697 
1698   /* Create the submatrices */
1699   if (scall == MAT_REUSE_MATRIX) {
1700     PetscBool flag;
1701 
1702     /*
1703         Assumes new rows are same length as the old rows,hence bug!
1704     */
1705     for (i=0; i<ismax; i++) {
1706       mat = (Mat_SeqAIJ*)(submats[i]->data);
1707       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1708       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1709       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1710       /* Initial matrix as if empty */
1711       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1712 
1713       submats[i]->factortype = C->factortype;
1714     }
1715   } else {
1716     for (i=0; i<ismax; i++) {
1717       PetscInt rbs,cbs;
1718       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1719       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1720 
1721       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1722       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1723 
1724       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1725       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1726       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1727     }
1728   }
1729 
1730   /* Assemble the matrices */
1731   /* First assemble the local rows */
1732   {
1733     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1734     PetscScalar *imat_a;
1735 
1736     for (i=0; i<ismax; i++) {
1737       mat       = (Mat_SeqAIJ*)submats[i]->data;
1738       imat_ilen = mat->ilen;
1739       imat_j    = mat->j;
1740       imat_i    = mat->i;
1741       imat_a    = mat->a;
1742 
1743       if (!allcolumns[i]) cmap_i = cmap[i];
1744       rmap_i = rmap[i];
1745       irow_i = irow[i];
1746       jmax   = nrow[i];
1747       for (j=0; j<jmax; j++) {
1748         l   = 0;
1749         row = irow_i[j];
1750         while (row >= C->rmap->range[l+1]) l++;
1751         proc = l;
1752         if (proc == rank) {
1753           old_row = row;
1754 #if defined(PETSC_USE_CTABLE)
1755           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1756           row--;
1757 #else
1758           row = rmap_i[row];
1759 #endif
1760           ilen_row = imat_ilen[row];
1761           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1762           mat_i    = imat_i[row];
1763           mat_a    = imat_a + mat_i;
1764           mat_j    = imat_j + mat_i;
1765           if (!allcolumns[i]) {
1766             for (k=0; k<ncols; k++) {
1767 #if defined(PETSC_USE_CTABLE)
1768               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1769 #else
1770               tcol = cmap_i[cols[k]];
1771 #endif
1772               if (tcol) {
1773                 *mat_j++ = tcol - 1;
1774                 *mat_a++ = vals[k];
1775                 ilen_row++;
1776               }
1777             }
1778           } else { /* allcolumns */
1779             for (k=0; k<ncols; k++) {
1780               *mat_j++ = cols[k];  /* global col index! */
1781               *mat_a++ = vals[k];
1782               ilen_row++;
1783             }
1784           }
1785           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1786 
1787           imat_ilen[row] = ilen_row;
1788         }
1789       }
1790     }
1791   }
1792 
1793   /*   Now assemble the off proc rows*/
1794   {
1795     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1796     PetscInt    *imat_j,*imat_i;
1797     PetscScalar *imat_a,*rbuf4_i;
1798 
1799     for (tmp2=0; tmp2<nrqs; tmp2++) {
1800       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1801       idex    = pa[idex2];
1802       sbuf1_i = sbuf1[idex];
1803       jmax    = sbuf1_i[0];
1804       ct1     = 2*jmax + 1;
1805       ct2     = 0;
1806       rbuf2_i = rbuf2[idex2];
1807       rbuf3_i = rbuf3[idex2];
1808       rbuf4_i = rbuf4[idex2];
1809       for (j=1; j<=jmax; j++) {
1810         is_no     = sbuf1_i[2*j-1];
1811         rmap_i    = rmap[is_no];
1812         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1813         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1814         imat_ilen = mat->ilen;
1815         imat_j    = mat->j;
1816         imat_i    = mat->i;
1817         imat_a    = mat->a;
1818         max1      = sbuf1_i[2*j];
1819         for (k=0; k<max1; k++,ct1++) {
1820           row = sbuf1_i[ct1];
1821 #if defined(PETSC_USE_CTABLE)
1822           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1823           row--;
1824 #else
1825           row = rmap_i[row];
1826 #endif
1827           ilen  = imat_ilen[row];
1828           mat_i = imat_i[row];
1829           mat_a = imat_a + mat_i;
1830           mat_j = imat_j + mat_i;
1831           max2  = rbuf2_i[ct1];
1832           if (!allcolumns[is_no]) {
1833             for (l=0; l<max2; l++,ct2++) {
1834 
1835 #if defined(PETSC_USE_CTABLE)
1836               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1837 #else
1838               tcol = cmap_i[rbuf3_i[ct2]];
1839 #endif
1840               if (tcol) {
1841                 *mat_j++ = tcol - 1;
1842                 *mat_a++ = rbuf4_i[ct2];
1843                 ilen++;
1844               }
1845             }
1846           } else { /* allcolumns */
1847             for (l=0; l<max2; l++,ct2++) {
1848               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1849               *mat_a++ = rbuf4_i[ct2];
1850               ilen++;
1851             }
1852           }
1853           imat_ilen[row] = ilen;
1854         }
1855       }
1856     }
1857   }
1858 
1859   /* sort the rows */
1860   {
1861     PetscInt    *imat_ilen,*imat_j,*imat_i;
1862     PetscScalar *imat_a;
1863 
1864     for (i=0; i<ismax; i++) {
1865       mat       = (Mat_SeqAIJ*)submats[i]->data;
1866       imat_j    = mat->j;
1867       imat_i    = mat->i;
1868       imat_a    = mat->a;
1869       imat_ilen = mat->ilen;
1870 
1871       if (allcolumns[i]) continue;
1872       jmax = nrow[i];
1873       for (j=0; j<jmax; j++) {
1874         PetscInt ilen;
1875 
1876         mat_i = imat_i[j];
1877         mat_a = imat_a + mat_i;
1878         mat_j = imat_j + mat_i;
1879         ilen  = imat_ilen[j];
1880         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1881       }
1882     }
1883   }
1884 
1885   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1886   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1887   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1888   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1889   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1890 
1891   /* Restore the indices */
1892   for (i=0; i<ismax; i++) {
1893     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1894     if (!allcolumns[i]) {
1895       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1896     }
1897   }
1898 
1899   /* Destroy allocated memory */
1900   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1901   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1902   ierr = PetscFree(pa);CHKERRQ(ierr);
1903 
1904   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1905   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1906   for (i=0; i<nrqr; ++i) {
1907     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1908   }
1909   for (i=0; i<nrqs; ++i) {
1910     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1911     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1912   }
1913 
1914   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1915   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1916   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1917   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1918   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1919   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1920   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1921 
1922 #if defined(PETSC_USE_CTABLE)
1923   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1924 #else
1925   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1926 #endif
1927   ierr = PetscFree(rmap);CHKERRQ(ierr);
1928 
1929   for (i=0; i<ismax; i++) {
1930     if (!allcolumns[i]) {
1931 #if defined(PETSC_USE_CTABLE)
1932       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1933 #else
1934       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1935 #endif
1936     }
1937   }
1938   ierr = PetscFree(cmap);CHKERRQ(ierr);
1939   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1940   ierr = PetscFree(lens);CHKERRQ(ierr);
1941 
1942   for (i=0; i<ismax; i++) {
1943     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1944     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1945   }
1946   PetscFunctionReturn(0);
1947 }
1948 
1949 /*
1950  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1951  Be careful not to destroy them elsewhere.
1952  */
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix C on comm whose local diagonal block is A and whose
   local off-diagonal block is B.  A and B are adopted directly (no incref is
   taken), so the caller must not destroy them independently of C.

   Input:  comm - communicator for the new matrix
           A    - sequential AIJ diagonal block (local rows x local cols)
           B    - sequential AIJ off-diagonal block (same local row count as A)
   Output: C    - the assembled MPIAIJ matrix
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B must agree on the local row count and on both block sizes */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* After layout setup the global column count must equal diag + off-diag local widths */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  aij    = (Mat_MPIAIJ*)((*C)->data);
  /* Adopt A and B directly as the diag/off-diag blocks (no incref; see note above MatSetType) */
  aij->A = A;
  aij->B = B;
  ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is preallocated/assembled only if both component blocks are */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}
1987 
1988 #undef __FUNCT__
1989 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1990 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1991 {
1992   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1993 
1994   PetscFunctionBegin;
1995   PetscValidPointer(A,2);
1996   PetscValidPointer(B,3);
1997   *A = aij->A;
1998   *B = aij->B;
1999   /* Note that we don't incref *A and *B, so be careful! */
2000   PetscFunctionReturn(0);
2001 }
2002 
2003 #undef __FUNCT__
2004 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
2005 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2006                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
2007                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
2008                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
2009 {
2010   PetscErrorCode ierr;
2011   PetscMPIInt    size, flag;
2012   PetscInt       i,ii;
2013   PetscInt       ismax_c;
2014 
2015   PetscFunctionBegin;
2016   if (!ismax) PetscFunctionReturn(0);
2017 
2018   for (i = 0, ismax_c = 0; i < ismax; ++i) {
2019     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
2020     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2021     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2022     if (size > 1) ++ismax_c;
2023   }
2024   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
2025     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2026   } else { /* if (ismax_c) */
2027     Mat         *A,*B;
2028     IS          *isrow_c, *iscol_c;
2029     PetscMPIInt size;
2030     /*
2031      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2032      array of sequential matrices underlying the resulting parallel matrices.
2033      Which arrays to allocate is based on the value of MatReuse scall.
2034      There are as many diag matrices as there are original index sets.
2035      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
2036 
2037      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
2038      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
2039      will deposite the extracted diag and off-diag parts.
2040      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
2041      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
2042     */
2043 
2044     /* Parallel matrix array is allocated only if no reuse is taking place. */
2045     if (scall != MAT_REUSE_MATRIX) {
2046       ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2047     } else {
2048       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
2049       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
2050       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
2051       for (i = 0, ii = 0; i < ismax; ++i) {
2052         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2053         if (size > 1) {
2054           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
2055           ++ii;
2056         } else A[i] = (*submat)[i];
2057       }
2058     }
2059     /*
2060      Construct the complements of the iscol ISs for parallel ISs only.
2061      These are used to extract the off-diag portion of the resulting parallel matrix.
2062      The row IS for the off-diag portion is the same as for the diag portion,
2063      so we merely alias the row IS, while skipping those that are sequential.
2064     */
2065     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
2066     for (i = 0, ii = 0; i < ismax; ++i) {
2067       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2068       if (size > 1) {
2069         isrow_c[ii] = isrow[i];
2070 
2071         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
2072         ++ii;
2073       }
2074     }
2075     /* Now obtain the sequential A and B submatrices separately. */
2076     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
2077     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
2078     for (ii = 0; ii < ismax_c; ++ii) {
2079       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
2080     }
2081     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
2082     /*
2083      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
2084      have been extracted directly into the parallel matrices containing them, or
2085      simply into the sequential matrix identical with the corresponding A (if size == 1).
2086      Otherwise, make sure that parallel matrices are constructed from A & B, or the
2087      A is put into the correct submat slot (if size == 1).
2088      */
2089     if (scall != MAT_REUSE_MATRIX) {
2090       for (i = 0, ii = 0; i < ismax; ++i) {
2091         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2092         if (size > 1) {
2093           /*
2094            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
2095            */
2096           /* Construct submat[i] from the Seq pieces A and B. */
2097           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
2098 
2099           ++ii;
2100         } else (*submat)[i] = A[i];
2101       }
2102     }
2103     ierr = PetscFree(A);CHKERRQ(ierr);
2104     ierr = PetscFree(B);CHKERRQ(ierr);
2105   }
2106   PetscFunctionReturn(0);
2107 } /* MatGetSubMatricesParallel_MPIXAIJ() */
2108 
2109 
2110 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MPIAIJ front end for MatGetSubMatricesParallel_MPIXAIJ: supplies the
   AIJ-specific callbacks for sequential extraction, assembly of a parallel
   matrix from seq pieces, and recovery of the seq pieces on reuse.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2121