xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 9d01615f782afd7a63a1dcd7f2ad02f7ce96d6f5)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscBool      mat_overlap_k,set;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   ierr = PetscOptionsGetBool(NULL,"-mat_overlap_k",&mat_overlap_k,&set);CHKERRQ(ierr);
33   if(!set) mat_overlap_k = PETSC_FALSE;
34   for (i=0; i<ov; ++i) {
35     if(mat_overlap_k){
36       ierr = MatIncreaseOverlap_MPIAIJ_Once_K(C,imax,is);CHKERRQ(ierr);
37     }else{
38       ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
39     }
40   }
41   PetscFunctionReturn(0);
42 }
43 
44 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_K"
/*
   MatIncreaseOverlap_MPIAIJ_Once_K - Scalable (PetscSF-based) single round of
   overlap increase.

   Phase 1: for each IS, rows owned by other ranks are shipped to their owners
   as pairs <is id, row>.  Phase 2: the owners expand each received row into
   its nonzero column indices (MatIncreaseOverlap_MPIAIJ_Send_K) and ship the
   results back as triples <is id, count, columns...>, which are merged into
   the index sets by MatIncreaseOverlap_MPIAIJ_Receive_K.  Locally owned rows
   are expanded by MatIncreaseOverlap_MPIAIJ_Local_K, overlapped with the
   second communication phase.

   Collective on the communicator of mat.  The IS objects in is[] are
   destroyed and recreated with the enlarged index lists.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map (only the row layout is needed to locate row owners) */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices; tlength bounds the number of remote rows */
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record <rank, is id, row> triples in parallel arrays */
      if(owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if(!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* compact send sizes and ranks: tosizes holds pairs <size, offset> per target */
  for(i=0; i<size; i++){
   if(tosizes_temp[i]){
    tosizes[nto*2]  = tosizes_temp[i]*2; /* size: 2 ints per row (<is id, row>) */
    tosizes_temp[i] = nto; /* a map from processor to index */
    toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest: each receive slot points at <rank, offset+j> on the sender */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
    for(j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* remote is PETSC_OWN_POINTER: ownership passes to the SF */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /* pack todata using the per-target running offsets computed above */
  for(i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* deal with remote data: expand the received rows into column lists to send back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_K(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second round: reply sizes flow back the other way (nfrom/nto roles swap) */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
    for(j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: local expansion runs during the bcast */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received column lists into the index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_K(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
192 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_K"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_K - Merge received, already-expanded rows
   into the index sets.

   Input:
     mat      - the matrix (unused here except for context)
     nidx     - number of index sets
     is       - the index sets; each is destroyed and recreated enlarged
     nrecvs   - total number of integers in recvdata
     recvdata - stream of triples <is id, count, count columns...>

   Each IS is copied into a scratch row of indices_temp, the received columns
   are appended, and the result is sorted with duplicates removed before the
   IS is rebuilt.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
   ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
   max_lsize = lsize>max_lsize ? lsize:max_lsize;
   isz[i]    = lsize;
  }
  /* one scratch row of (max_lsize+nrecvs) ints per IS: the whole receive stream
     has nrecvs entries, so no IS can gain more than nrecvs new indices */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information: walk the stream of <is id, count, columns...> triples */
  for(i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for(j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries and rebuild each IS from its scratch row */
  for(i=0; i<nidx; i++){
   indices_i  = indices_temp+(max_lsize+nrecvs)*i;
   isz_i      = isz[i];
   ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_K"
/*
   MatIncreaseOverlap_MPIAIJ_Send_K - Expand rows requested by other ranks into
   their nonzero column indices, and build the reply buffers.

   Input:
     mat       - the MPIAIJ matrix
     nidx      - number of index sets
     nfrom     - number of requesting ranks
     fromranks - the requesting ranks
     fromsizes - pairs <size, offset> per requesting rank (as built by
                 PetscCommBuildTwoSided in the caller)
     fromrows  - received pairs <is id, row> concatenated for all ranks

   Output (allocated here with PetscCalloc2; caller frees with PetscFree2):
     sbrowsizes - pairs <size, offset> per rank describing sbrows
     sbrows     - reply stream of triples <is id, count, count columns...>

   Two symmetric passes are made over fromrows: the first sizes the reply
   buffer (unique columns per <rank, is> pair), the second fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values; bounds the size of any expanded row list */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest message */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count unique columns per <rank, is> pair to size the reply buffer */
  for(i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data: bucket this rank's requested rows by is id */
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows>; nidx*nfrom*2 covers all headers */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: repeat the same traversal, this time writing data (use this code again) */
  for(i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets: prefix sums of the per-rank sizes, stored in the odd slots */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
369 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_K"
/*
   MatIncreaseOverlap_MPIAIJ_Local_K - Expand the locally owned rows of each
   index set into their nonzero column indices (from both the diagonal A and
   off-diagonal B blocks).

   Rows owned by other ranks are skipped; they are handled by the send/receive
   path in MatIncreaseOverlap_MPIAIJ_Once_K.  Each IS is destroyed and
   recreated with the sorted, duplicate-free expansion.

   NOTE(review): after expansion an IS contains only the columns of its local
   rows -- the original remote row indices are dropped here and reinstated via
   the communicated data; confirm against _Receive_K when modifying.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for(j=0; j<lsize; j++) {
       owner = -1;
       row   = indices[j];
       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
       if(owner != rank) continue; /* remote rows are expanded by their owners */
       /* local number */
       row  -= rstart;
       start = ai[row];
       end   = ai[row+1];
       for(k=start; k<end; k++) { /* Amat: local column + cstart gives the global column */
         col = aj[k] + cstart;
         indices_temp[lsize_tmp++] = col;
       }
       start = bi[row];
       end   = bi[row+1];
       for(k=start; k<end; k++) { /* Bmat: gcols maps compressed to global columns */
         col = gcols[bj[k]];
         indices_temp[lsize_tmp++] = col;
       }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
435 
436 
437 /*
438   Sample message format:
439   If a processor A wants processor B to process some elements corresponding
440   to index sets is[1],is[5]
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5]
450 mesg [n]  data(is[1])
451   -----------
452   mesg[n+1]
453   mesg[m]  data(is[5])
454   -----------
455 
456   Notes:
457   nrqs - no of requests sent (or to be sent out)
458 nrqr - no of requests received (which have to be, or which have been, processed)
459 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Original (point-to-point Isend/Irecv) single
   round of overlap increase.  Requests for remote rows are sent out with the
   header format documented above; each owner expands the rows it receives
   (MatIncreaseOverlap_MPIAIJ_Receive) and sends the results back, while local
   rows are expanded in place (MatIncreaseOverlap_MPIAIJ_Local) using per-IS
   bitmaps sized by the global row count M -- hence not memory scalable.

   Collective on the communicator of C; the IS objects in is[] are destroyed
   and recreated enlarged.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total entries to p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of ISs contributing to p, w4 = per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data area begins after the header */
  }

  /* Memory for doing local proc's work: per-IS bitmap (t_p) and result list (d_p),
     both sized by the GLOBAL number of rows M */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the requests we received into reply messages */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the requests,
         ct1 starts just past the header */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the enlarged index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
735 /*
736    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
737        the work on the local processor.
738 
739      Inputs:
740       C      - MAT_MPIAIJ;
741       imax - total no of index sets processed at a time;
742       table  - an array of char - size = m bits.
743 
744      Output:
745       isz    - array containing the count of the solution elements corresponding
746                to each index set;
747       data   - pointer to the solutions
748 */
749 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
750 {
751   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
752   Mat        A  = c->A,B = c->B;
753   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
754   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
755   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
756   PetscBT    table_i;
757 
758   PetscFunctionBegin;
759   rstart = C->rmap->rstart;
760   cstart = C->cmap->rstart;
761   ai     = a->i;
762   aj     = a->j;
763   bi     = b->i;
764   bj     = b->j;
765   garray = c->garray;
766 
767 
768   for (i=0; i<imax; i++) {
769     data_i  = data[i];
770     table_i = table[i];
771     isz_i   = isz[i];
772     for (j=0,max=isz[i]; j<max; j++) {
773       row   = data_i[j] - rstart;
774       start = ai[row];
775       end   = ai[row+1];
776       for (k=start; k<end; k++) { /* Amat */
777         val = aj[k] + cstart;
778         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
779       }
780       start = bi[row];
781       end   = bi[row+1];
782       for (k=start; k<end; k++) { /* Bmat */
783         val = garray[bj[k]];
784         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
785       }
786     }
787     isz[i] = isz_i;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 #undef __FUNCT__
793 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
794 /*
795       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
796          and return the output
797 
798          Input:
799            C    - the matrix
800            nrqr - no of messages being processed.
801            rbuf - an array of pointers to the recieved requests
802 
803          Output:
804            xdata - array of messages to be sent back
805            isz1  - size of each message
806 
807   For better efficiency perhaps we should malloc separately each xdata[i],
808 then if a remalloc is required we need only copy the data for that one row
809 rather then all previous rows as it is now where a single large chunck of
810 memory is used.
811 
812 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* Incoming message rbuf[i] layout (mirrored in the reply built below):
       rbuf[i][0]                    = number of index sets in the message
       rbuf[i][2*j-1], rbuf[i][2*j]  = (sender's IS tag, row count) for j=1..rbuf[i][0]
     followed by the requested row indices.  First pass: count total rows
     requested (ct) and total data entries (total_sz). */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Size estimate for the single reply buffer xdata[0]: requested rows times
     the average nonzeros per locally owned row (guard against zero local rows);
     grown on demand below. */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  /* bit table over all global rows/columns, reused per IS to deduplicate */
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0; /* total entries written into xdata[0] across all messages */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor: first row index after the header */
    ct2    =  ct1;         /* write cursor into xdata[i]; header space reserved */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2; /* start of this IS's output; second sweep below walks [oct2,ct2) */
      kmax = rbuf_i[2*j];
      /* copy the requested rows themselves, deduplicated via xtable */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* out of space: grow xdata[0] by 1.5x and rebase the per-message
               pointers xdata[1..i], which all alias into the one buffer */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second sweep: for each row just copied, append that row's columns
         (diagonal block shifted by cstart, off-diagonal mapped via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: count of entries for this IS, and echo back the
         sender's IS tag unchanged */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* replies are packed back-to-back in xdata[0] */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
929 /* -------------------------------------------------------------------------*/
930 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
931 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
932 /*
933     Every processor gets the entire matrix
934 */
935 #undef __FUNCT__
936 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the entire parallel matrix A onto every
   process as a sequential matrix, returned in (*Bin)[0].

   Input:
     A     - the distributed MPIAIJ matrix
     flag  - MAT_GET_VALUES to also gather the numerical values; otherwise only
             the nonzero structure is assembled
     scall - MAT_INITIAL_MATRIX builds B from scratch (and allocates *Bin);
             MAT_REUSE_MATRIX refills the values of the previously returned B

   Output:
     Bin   - array whose first entry is the gathered sequential matrix
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[i] = nnz of global row i = diag-block nnz + offdiag-block nnz */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* each rank already holds its own slice of lens at the right offset, so
       MPI_IN_PLACE avoids a copy when available */
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* pack this rank's rows into B's j array at the position the allgather
       below expects (b->i offsets are valid thanks to the preallocation) */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same lower/diagonal/upper interleaving as the column-index pass above,
       so values line up with the already-gathered structure */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed only on the MAT_INITIAL_MATRIX path's exit,
       so on reuse they must be (re)allocated here */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1119 
1120 
1121 
1122 #undef __FUNCT__
1123 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1124 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1125 {
1126   PetscErrorCode ierr;
1127   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1128   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1129 
1130   PetscFunctionBegin;
1131   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
1132   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
1133      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
1134      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
1135    */
1136   for (i = 0; i < ismax; ++i) {
1137     PetscBool sorted;
1138     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
1139     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
1140   }
1141 
1142   /*
1143        Check for special case: each processor gets entire matrix
1144   */
1145   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1146     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1147     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1148     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1149     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1150     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1151       wantallmatrix = PETSC_TRUE;
1152 
1153       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1154     }
1155   }
1156   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1157   if (twantallmatrix) {
1158     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1159     PetscFunctionReturn(0);
1160   }
1161 
1162   /* Allocate memory to hold all the submatrices */
1163   if (scall != MAT_REUSE_MATRIX) {
1164     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1165   }
1166 
1167   /* Check for special case: each processor gets entire matrix columns */
1168   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1169   for (i=0; i<ismax; i++) {
1170     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1171     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1172     if (colflag && ncol == C->cmap->N) {
1173       allcolumns[i] = PETSC_TRUE;
1174     } else {
1175       allcolumns[i] = PETSC_FALSE;
1176     }
1177   }
1178 
1179   /* Determine the number of stages through which submatrices are done */
1180   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1181 
1182   /*
1183      Each stage will extract nmax submatrices.
1184      nmax is determined by the matrix column dimension.
1185      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1186   */
1187   if (!nmax) nmax = 1;
1188   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1189 
1190   /* Make sure every processor loops through the nstages */
1191   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1192 
1193   for (i=0,pos=0; i<nstages; i++) {
1194     if (pos+nmax <= ismax) max_no = nmax;
1195     else if (pos == ismax) max_no = 0;
1196     else                   max_no = ismax-pos;
1197     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1198     pos += max_no;
1199   }
1200 
1201   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 /* -------------------------------------------------------------------------*/
1206 #undef __FUNCT__
1207 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1208 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1209 {
1210   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1211   Mat            A  = c->A;
1212   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1213   const PetscInt **icol,**irow;
1214   PetscInt       *nrow,*ncol,start;
1215   PetscErrorCode ierr;
1216   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1217   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1218   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1219   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1220   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1221 #if defined(PETSC_USE_CTABLE)
1222   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1223 #else
1224   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1225 #endif
1226   const PetscInt *irow_i;
1227   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1228   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1229   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1230   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1231   MPI_Status     *r_status3,*r_status4,*s_status4;
1232   MPI_Comm       comm;
1233   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1234   PetscMPIInt    *onodes1,*olengths1;
1235   PetscMPIInt    idex,idex2,end;
1236 
1237   PetscFunctionBegin;
1238   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1239   tag0 = ((PetscObject)C)->tag;
1240   size = c->size;
1241   rank = c->rank;
1242 
1243   /* Get some new tags to keep the communication clean */
1244   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1245   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1246   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1247 
1248   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1249 
1250   for (i=0; i<ismax; i++) {
1251     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1252     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1253     if (allcolumns[i]) {
1254       icol[i] = NULL;
1255       ncol[i] = C->cmap->N;
1256     } else {
1257       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1258       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1259     }
1260   }
1261 
1262   /* evaluate communication - mesg to who, length of mesg, and buffer space
1263      required. Based on this, buffers are allocated, and data copied into them*/
1264   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1265   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1266   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1267   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1268   for (i=0; i<ismax; i++) {
1269     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1270     jmax   = nrow[i];
1271     irow_i = irow[i];
1272     for (j=0; j<jmax; j++) {
1273       l   = 0;
1274       row = irow_i[j];
1275       while (row >= C->rmap->range[l+1]) l++;
1276       proc = l;
1277       w4[proc]++;
1278     }
1279     for (j=0; j<size; j++) {
1280       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1281     }
1282   }
1283 
1284   nrqs     = 0;              /* no of outgoing messages */
1285   msz      = 0;              /* total mesg length (for all procs) */
1286   w1[rank] = 0;              /* no mesg sent to self */
1287   w3[rank] = 0;
1288   for (i=0; i<size; i++) {
1289     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1290   }
1291   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1292   for (i=0,j=0; i<size; i++) {
1293     if (w1[i]) { pa[j] = i; j++; }
1294   }
1295 
1296   /* Each message would have a header = 1 + 2*(no of IS) + data */
1297   for (i=0; i<nrqs; i++) {
1298     j      = pa[i];
1299     w1[j] += w2[j] + 2* w3[j];
1300     msz   += w1[j];
1301   }
1302   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1303 
1304   /* Determine the number of messages to expect, their lengths, from from-ids */
1305   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1306   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1307 
1308   /* Now post the Irecvs corresponding to these messages */
1309   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1310 
1311   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1312   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1313 
1314   /* Allocate Memory for outgoing messages */
1315   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1316   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1317   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1318 
1319   {
1320     PetscInt *iptr = tmp,ict = 0;
1321     for (i=0; i<nrqs; i++) {
1322       j        = pa[i];
1323       iptr    += ict;
1324       sbuf1[j] = iptr;
1325       ict      = w1[j];
1326     }
1327   }
1328 
1329   /* Form the outgoing messages */
1330   /* Initialize the header space */
1331   for (i=0; i<nrqs; i++) {
1332     j           = pa[i];
1333     sbuf1[j][0] = 0;
1334     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1335     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1336   }
1337 
1338   /* Parse the isrow and copy data into outbuf */
1339   for (i=0; i<ismax; i++) {
1340     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1341     irow_i = irow[i];
1342     jmax   = nrow[i];
1343     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1344       l   = 0;
1345       row = irow_i[j];
1346       while (row >= C->rmap->range[l+1]) l++;
1347       proc = l;
1348       if (proc != rank) { /* copy to the outgoing buf*/
1349         ctr[proc]++;
1350         *ptr[proc] = row;
1351         ptr[proc]++;
1352       }
1353     }
1354     /* Update the headers for the current IS */
1355     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1356       if ((ctr_j = ctr[j])) {
1357         sbuf1_j        = sbuf1[j];
1358         k              = ++sbuf1_j[0];
1359         sbuf1_j[2*k]   = ctr_j;
1360         sbuf1_j[2*k-1] = i;
1361       }
1362     }
1363   }
1364 
1365   /*  Now  post the sends */
1366   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1367   for (i=0; i<nrqs; ++i) {
1368     j    = pa[i];
1369     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1370   }
1371 
1372   /* Post Receives to capture the buffer size */
1373   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1374   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1375   rbuf2[0] = tmp + msz;
1376   for (i=1; i<nrqs; ++i) {
1377     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1378   }
1379   for (i=0; i<nrqs; ++i) {
1380     j    = pa[i];
1381     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1382   }
1383 
1384   /* Send to other procs the buf size they should allocate */
1385 
1386 
1387   /* Receive messages*/
1388   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1389   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1390   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1391   {
1392     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1393     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1394     PetscInt   *sbuf2_i;
1395 
1396     for (i=0; i<nrqr; ++i) {
1397       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1398 
1399       req_size[idex] = 0;
1400       rbuf1_i        = rbuf1[idex];
1401       start          = 2*rbuf1_i[0] + 1;
1402       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1403       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1404       sbuf2_i        = sbuf2[idex];
1405       for (j=start; j<end; j++) {
1406         id              = rbuf1_i[j] - rstart;
1407         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1408         sbuf2_i[j]      = ncols;
1409         req_size[idex] += ncols;
1410       }
1411       req_source[idex] = r_status1[i].MPI_SOURCE;
1412       /* form the header */
1413       sbuf2_i[0] = req_size[idex];
1414       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1415 
1416       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1417     }
1418   }
1419   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1420   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1421 
1422   /*  recv buffer sizes */
1423   /* Receive messages*/
1424 
1425   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1426   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1427   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1428   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1429   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1430 
1431   for (i=0; i<nrqs; ++i) {
1432     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1433     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1434     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1435     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1436     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1437   }
1438   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1439   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1440 
1441   /* Wait on sends1 and sends2 */
1442   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1443   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1444 
1445   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1446   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1447   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1448   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1449   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1450   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1451 
1452   /* Now allocate buffers for a->j, and send them off */
1453   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1454   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1455   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1456   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1457 
1458   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1459   {
1460     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1461     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1462     PetscInt cend = C->cmap->rend;
1463     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1464 
1465     for (i=0; i<nrqr; i++) {
1466       rbuf1_i   = rbuf1[i];
1467       sbuf_aj_i = sbuf_aj[i];
1468       ct1       = 2*rbuf1_i[0] + 1;
1469       ct2       = 0;
1470       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1471         kmax = rbuf1[i][2*j];
1472         for (k=0; k<kmax; k++,ct1++) {
1473           row    = rbuf1_i[ct1] - rstart;
1474           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1475           ncols  = nzA + nzB;
1476           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1477 
1478           /* load the column indices for this row into cols*/
1479           cols = sbuf_aj_i + ct2;
1480 
1481           lwrite = 0;
1482           for (l=0; l<nzB; l++) {
1483             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1484           }
1485           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1486           for (l=0; l<nzB; l++) {
1487             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1488           }
1489 
1490           ct2 += ncols;
1491         }
1492       }
1493       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1494     }
1495   }
1496   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1497   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1498 
1499   /* Allocate buffers for a->a, and send them off */
1500   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1501   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1502   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1503   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1504 
1505   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1506   {
1507     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1508     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1509     PetscInt    cend   = C->cmap->rend;
1510     PetscInt    *b_j   = b->j;
1511     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1512 
1513     for (i=0; i<nrqr; i++) {
1514       rbuf1_i   = rbuf1[i];
1515       sbuf_aa_i = sbuf_aa[i];
1516       ct1       = 2*rbuf1_i[0]+1;
1517       ct2       = 0;
1518       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1519         kmax = rbuf1_i[2*j];
1520         for (k=0; k<kmax; k++,ct1++) {
1521           row    = rbuf1_i[ct1] - rstart;
1522           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1523           ncols  = nzA + nzB;
1524           cworkB = b_j + b_i[row];
1525           vworkA = a_a + a_i[row];
1526           vworkB = b_a + b_i[row];
1527 
1528           /* load the column values for this row into vals*/
1529           vals = sbuf_aa_i+ct2;
1530 
1531           lwrite = 0;
1532           for (l=0; l<nzB; l++) {
1533             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1534           }
1535           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1536           for (l=0; l<nzB; l++) {
1537             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1538           }
1539 
1540           ct2 += ncols;
1541         }
1542       }
1543       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1544     }
1545   }
1546   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1547   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1548   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1549   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1550 
1551   /* Form the matrix */
1552   /* create col map: global col of C -> local col of submatrices */
1553   {
1554     const PetscInt *icol_i;
1555 #if defined(PETSC_USE_CTABLE)
1556     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1557     for (i=0; i<ismax; i++) {
1558       if (!allcolumns[i]) {
1559         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1560 
1561         jmax   = ncol[i];
1562         icol_i = icol[i];
1563         cmap_i = cmap[i];
1564         for (j=0; j<jmax; j++) {
1565           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1566         }
1567       } else {
1568         cmap[i] = NULL;
1569       }
1570     }
1571 #else
1572     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1573     for (i=0; i<ismax; i++) {
1574       if (!allcolumns[i]) {
1575         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1576         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1577         jmax   = ncol[i];
1578         icol_i = icol[i];
1579         cmap_i = cmap[i];
1580         for (j=0; j<jmax; j++) {
1581           cmap_i[icol_i[j]] = j+1;
1582         }
1583       } else {
1584         cmap[i] = NULL;
1585       }
1586     }
1587 #endif
1588   }
1589 
1590   /* Create lens which is required for MatCreate... */
1591   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1592   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1593   if (ismax) {
1594     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1595     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1596   }
1597   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1598 
1599   /* Update lens from local data */
1600   for (i=0; i<ismax; i++) {
1601     jmax = nrow[i];
1602     if (!allcolumns[i]) cmap_i = cmap[i];
1603     irow_i = irow[i];
1604     lens_i = lens[i];
1605     for (j=0; j<jmax; j++) {
1606       l   = 0;
1607       row = irow_i[j];
1608       while (row >= C->rmap->range[l+1]) l++;
1609       proc = l;
1610       if (proc == rank) {
1611         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1612         if (!allcolumns[i]) {
1613           for (k=0; k<ncols; k++) {
1614 #if defined(PETSC_USE_CTABLE)
1615             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1616 #else
1617             tcol = cmap_i[cols[k]];
1618 #endif
1619             if (tcol) lens_i[j]++;
1620           }
1621         } else { /* allcolumns */
1622           lens_i[j] = ncols;
1623         }
1624         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1625       }
1626     }
1627   }
1628 
1629   /* Create row map: global row of C -> local row of submatrices */
1630 #if defined(PETSC_USE_CTABLE)
1631   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1632   for (i=0; i<ismax; i++) {
1633     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1634     rmap_i = rmap[i];
1635     irow_i = irow[i];
1636     jmax   = nrow[i];
1637     for (j=0; j<jmax; j++) {
1638       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1639     }
1640   }
1641 #else
1642   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1643   if (ismax) {
1644     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1645     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1646   }
1647   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1648   for (i=0; i<ismax; i++) {
1649     rmap_i = rmap[i];
1650     irow_i = irow[i];
1651     jmax   = nrow[i];
1652     for (j=0; j<jmax; j++) {
1653       rmap_i[irow_i[j]] = j;
1654     }
1655   }
1656 #endif
1657 
1658   /* Update lens from offproc data */
1659   {
1660     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1661 
1662     for (tmp2=0; tmp2<nrqs; tmp2++) {
1663       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1664       idex    = pa[idex2];
1665       sbuf1_i = sbuf1[idex];
1666       jmax    = sbuf1_i[0];
1667       ct1     = 2*jmax+1;
1668       ct2     = 0;
1669       rbuf2_i = rbuf2[idex2];
1670       rbuf3_i = rbuf3[idex2];
1671       for (j=1; j<=jmax; j++) {
1672         is_no  = sbuf1_i[2*j-1];
1673         max1   = sbuf1_i[2*j];
1674         lens_i = lens[is_no];
1675         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1676         rmap_i = rmap[is_no];
1677         for (k=0; k<max1; k++,ct1++) {
1678 #if defined(PETSC_USE_CTABLE)
1679           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1680           row--;
1681           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1682 #else
1683           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1684 #endif
1685           max2 = rbuf2_i[ct1];
1686           for (l=0; l<max2; l++,ct2++) {
1687             if (!allcolumns[is_no]) {
1688 #if defined(PETSC_USE_CTABLE)
1689               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1690 #else
1691               tcol = cmap_i[rbuf3_i[ct2]];
1692 #endif
1693               if (tcol) lens_i[row]++;
1694             } else { /* allcolumns */
1695               lens_i[row]++; /* lens_i[row] += max2 ? */
1696             }
1697           }
1698         }
1699       }
1700     }
1701   }
1702   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1703   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1704   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1705   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1706   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1707 
1708   /* Create the submatrices */
1709   if (scall == MAT_REUSE_MATRIX) {
1710     PetscBool flag;
1711 
    /*
        Assumes the new rows are the same length as the old rows, hence a bug!
    */
1715     for (i=0; i<ismax; i++) {
1716       mat = (Mat_SeqAIJ*)(submats[i]->data);
1717       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1718       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1719       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1720       /* Initial matrix as if empty */
1721       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1722 
1723       submats[i]->factortype = C->factortype;
1724     }
1725   } else {
1726     for (i=0; i<ismax; i++) {
1727       PetscInt rbs,cbs;
1728       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1729       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1730 
1731       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1732       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1733 
1734       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1735       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1736       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1737     }
1738   }
1739 
1740   /* Assemble the matrices */
1741   /* First assemble the local rows */
1742   {
1743     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1744     PetscScalar *imat_a;
1745 
1746     for (i=0; i<ismax; i++) {
1747       mat       = (Mat_SeqAIJ*)submats[i]->data;
1748       imat_ilen = mat->ilen;
1749       imat_j    = mat->j;
1750       imat_i    = mat->i;
1751       imat_a    = mat->a;
1752 
1753       if (!allcolumns[i]) cmap_i = cmap[i];
1754       rmap_i = rmap[i];
1755       irow_i = irow[i];
1756       jmax   = nrow[i];
1757       for (j=0; j<jmax; j++) {
1758         l   = 0;
1759         row = irow_i[j];
1760         while (row >= C->rmap->range[l+1]) l++;
1761         proc = l;
1762         if (proc == rank) {
1763           old_row = row;
1764 #if defined(PETSC_USE_CTABLE)
1765           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1766           row--;
1767 #else
1768           row = rmap_i[row];
1769 #endif
1770           ilen_row = imat_ilen[row];
1771           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1772           mat_i    = imat_i[row];
1773           mat_a    = imat_a + mat_i;
1774           mat_j    = imat_j + mat_i;
1775           if (!allcolumns[i]) {
1776             for (k=0; k<ncols; k++) {
1777 #if defined(PETSC_USE_CTABLE)
1778               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1779 #else
1780               tcol = cmap_i[cols[k]];
1781 #endif
1782               if (tcol) {
1783                 *mat_j++ = tcol - 1;
1784                 *mat_a++ = vals[k];
1785                 ilen_row++;
1786               }
1787             }
1788           } else { /* allcolumns */
1789             for (k=0; k<ncols; k++) {
1790               *mat_j++ = cols[k];  /* global col index! */
1791               *mat_a++ = vals[k];
1792               ilen_row++;
1793             }
1794           }
1795           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1796 
1797           imat_ilen[row] = ilen_row;
1798         }
1799       }
1800     }
1801   }
1802 
1803   /*   Now assemble the off proc rows*/
1804   {
1805     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1806     PetscInt    *imat_j,*imat_i;
1807     PetscScalar *imat_a,*rbuf4_i;
1808 
1809     for (tmp2=0; tmp2<nrqs; tmp2++) {
1810       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1811       idex    = pa[idex2];
1812       sbuf1_i = sbuf1[idex];
1813       jmax    = sbuf1_i[0];
1814       ct1     = 2*jmax + 1;
1815       ct2     = 0;
1816       rbuf2_i = rbuf2[idex2];
1817       rbuf3_i = rbuf3[idex2];
1818       rbuf4_i = rbuf4[idex2];
1819       for (j=1; j<=jmax; j++) {
1820         is_no     = sbuf1_i[2*j-1];
1821         rmap_i    = rmap[is_no];
1822         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1823         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1824         imat_ilen = mat->ilen;
1825         imat_j    = mat->j;
1826         imat_i    = mat->i;
1827         imat_a    = mat->a;
1828         max1      = sbuf1_i[2*j];
1829         for (k=0; k<max1; k++,ct1++) {
1830           row = sbuf1_i[ct1];
1831 #if defined(PETSC_USE_CTABLE)
1832           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1833           row--;
1834 #else
1835           row = rmap_i[row];
1836 #endif
1837           ilen  = imat_ilen[row];
1838           mat_i = imat_i[row];
1839           mat_a = imat_a + mat_i;
1840           mat_j = imat_j + mat_i;
1841           max2  = rbuf2_i[ct1];
1842           if (!allcolumns[is_no]) {
1843             for (l=0; l<max2; l++,ct2++) {
1844 
1845 #if defined(PETSC_USE_CTABLE)
1846               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1847 #else
1848               tcol = cmap_i[rbuf3_i[ct2]];
1849 #endif
1850               if (tcol) {
1851                 *mat_j++ = tcol - 1;
1852                 *mat_a++ = rbuf4_i[ct2];
1853                 ilen++;
1854               }
1855             }
1856           } else { /* allcolumns */
1857             for (l=0; l<max2; l++,ct2++) {
1858               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1859               *mat_a++ = rbuf4_i[ct2];
1860               ilen++;
1861             }
1862           }
1863           imat_ilen[row] = ilen;
1864         }
1865       }
1866     }
1867   }
1868 
1869   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1870   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1871   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1872   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1873   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1874 
1875   /* Restore the indices */
1876   for (i=0; i<ismax; i++) {
1877     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1878     if (!allcolumns[i]) {
1879       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1880     }
1881   }
1882 
1883   /* Destroy allocated memory */
1884   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1885   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1886   ierr = PetscFree(pa);CHKERRQ(ierr);
1887 
1888   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1889   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1890   for (i=0; i<nrqr; ++i) {
1891     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1892   }
1893   for (i=0; i<nrqs; ++i) {
1894     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1895     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1896   }
1897 
1898   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1899   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1900   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1901   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1902   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1903   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1904   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1905 
1906 #if defined(PETSC_USE_CTABLE)
1907   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1908 #else
1909   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1910 #endif
1911   ierr = PetscFree(rmap);CHKERRQ(ierr);
1912 
1913   for (i=0; i<ismax; i++) {
1914     if (!allcolumns[i]) {
1915 #if defined(PETSC_USE_CTABLE)
1916       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1917 #else
1918       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1919 #endif
1920     }
1921   }
1922   ierr = PetscFree(cmap);CHKERRQ(ierr);
1923   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1924   ierr = PetscFree(lens);CHKERRQ(ierr);
1925 
1926   for (i=0; i<ismax; i++) {
1927     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1928     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1929   }
1930   PetscFunctionReturn(0);
1931 }
1932 
1933 /*
1934  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1935  Be careful not to destroy them elsewhere.
1936  */
1937 #undef __FUNCT__
1938 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1939 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1940 {
1941   /* If making this function public, change the error returned in this function away from _PLIB. */
1942   PetscErrorCode ierr;
1943   Mat_MPIAIJ     *aij;
1944   PetscBool      seqaij;
1945 
1946   PetscFunctionBegin;
1947   /* Check to make sure the component matrices are compatible with C. */
1948   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1949   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1950   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1951   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1952   if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1953 
1954   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1955   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1956   ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
1957   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1958   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1959   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1960   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1961   aij    = (Mat_MPIAIJ*)((*C)->data);
1962   aij->A = A;
1963   aij->B = B;
1964   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1965   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1966 
1967   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1968   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1969   PetscFunctionReturn(0);
1970 }
1971 
1972 #undef __FUNCT__
1973 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1974 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1975 {
1976   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1977 
1978   PetscFunctionBegin;
1979   PetscValidPointer(A,2);
1980   PetscValidPointer(B,3);
1981   *A = aij->A;
1982   *B = aij->B;
1983   /* Note that we don't incref *A and *B, so be careful! */
1984   PetscFunctionReturn(0);
1985 }
1986 
1987 #undef __FUNCT__
1988 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1989 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1990                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1991                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1992                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1993 {
1994   PetscErrorCode ierr;
1995   PetscMPIInt    size, flag;
1996   PetscInt       i,ii;
1997   PetscInt       ismax_c;
1998 
1999   PetscFunctionBegin;
2000   if (!ismax) PetscFunctionReturn(0);
2001 
2002   for (i = 0, ismax_c = 0; i < ismax; ++i) {
2003     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
2004     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2005     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2006     if (size > 1) ++ismax_c;
2007   }
2008   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
2009     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2010   } else { /* if (ismax_c) */
2011     Mat         *A,*B;
2012     IS          *isrow_c, *iscol_c;
2013     PetscMPIInt size;
2014     /*
2015      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2016      array of sequential matrices underlying the resulting parallel matrices.
2017      Which arrays to allocate is based on the value of MatReuse scall.
2018      There are as many diag matrices as there are original index sets.
2019      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
2020 
2021      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
2022      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
2023      will deposite the extracted diag and off-diag parts.
2024      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
2025      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
2026     */
2027 
2028     /* Parallel matrix array is allocated only if no reuse is taking place. */
2029     if (scall != MAT_REUSE_MATRIX) {
2030       ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2031     } else {
2032       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
2033       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
2034       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
2035       for (i = 0, ii = 0; i < ismax; ++i) {
2036         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2037         if (size > 1) {
2038           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
2039           ++ii;
2040         } else A[i] = (*submat)[i];
2041       }
2042     }
2043     /*
2044      Construct the complements of the iscol ISs for parallel ISs only.
2045      These are used to extract the off-diag portion of the resulting parallel matrix.
2046      The row IS for the off-diag portion is the same as for the diag portion,
2047      so we merely alias the row IS, while skipping those that are sequential.
2048     */
2049     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
2050     for (i = 0, ii = 0; i < ismax; ++i) {
2051       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2052       if (size > 1) {
2053         isrow_c[ii] = isrow[i];
2054 
2055         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
2056         ++ii;
2057       }
2058     }
2059     /* Now obtain the sequential A and B submatrices separately. */
2060     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
2061     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
2062     for (ii = 0; ii < ismax_c; ++ii) {
2063       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
2064     }
2065     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
2066     /*
2067      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
2068      have been extracted directly into the parallel matrices containing them, or
2069      simply into the sequential matrix identical with the corresponding A (if size == 1).
2070      Otherwise, make sure that parallel matrices are constructed from A & B, or the
2071      A is put into the correct submat slot (if size == 1).
2072      */
2073     if (scall != MAT_REUSE_MATRIX) {
2074       for (i = 0, ii = 0; i < ismax; ++i) {
2075         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
2076         if (size > 1) {
2077           /*
2078            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
2079            */
2080           /* Construct submat[i] from the Seq pieces A and B. */
2081           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
2082 
2083           ++ii;
2084         } else (*submat)[i] = A[i];
2085       }
2086     }
2087     ierr = PetscFree(A);CHKERRQ(ierr);
2088     ierr = PetscFree(B);CHKERRQ(ierr);
2089   }
2090   PetscFunctionReturn(0);
2091 } /* MatGetSubMatricesParallel_MPIXAIJ() */
2092 
2093 
2094 
2095 #undef __FUNCT__
2096 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
2097 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2098 {
2099   PetscErrorCode ierr;
2100 
2101   PetscFunctionBegin;
2102   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
2103   PetscFunctionReturn(0);
2104 }
2105