xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a6404fbfb1cfbf30d2bac9856cef3bf7411483d5)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #include <../src/mat/impls/aij/mpi/mpiaij.h>
8 #include <petscbt.h>
9 #include <petscsf.h>
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21 
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
26 {
27   PetscErrorCode ierr;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   for (i=0; i<ov; ++i) {
33     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
34   }
35   PetscFunctionReturn(0);
36 }
37 
38 #undef __FUNCT__
39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
41 {
42   PetscErrorCode ierr;
43   PetscInt       i;
44 
45   PetscFunctionBegin;
46   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
47   for (i=0; i<ov; ++i) {
48     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
49   }
50   PetscFunctionReturn(0);
51 }
52 
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - increase the overlap of the index sets
  is[] by one layer using PetscSF (star forest) communication.  Memory use scales
  with the index-set sizes rather than with the global matrix size.

  Collective on mat.

  Input Parameters:
+ mat  - the parallel (MPIAIJ) matrix
. nidx - number of index sets
- is   - array of index sets holding global row numbers; replaced in place by the
         enlarged sets

  Outline:
  1. find the owning rank of every requested row via the row layout;
  2. two-sided exchange of <is id, row> pairs with the owning ranks;
  3. owners expand the received rows into column lists (Send_Scalable) and ship
     them back;
  4. the purely local expansion (Local_Scalable) is overlapped with the return
     communication;
  5. returned columns are merged into is[] (Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  /*communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /*row map*/
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /*retrieve IS data*/
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  /*get length and indices; tlength accumulates the total number of requested rows*/
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /*find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /*remote processors: remember <row, owning rank, originating is> triples*/
      if (owner != rank){
        tosizes_temp[owner]++; /*number of rows to owner*/
        rrow_ranks[nrrows]  = owner; /*processor */
        rrow_isids[nrrows]   = i; /*is id*/
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /*test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /*we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /*send sizes and ranks for building a two-sided exchange; each request row costs
   *two PetscInts on the wire: <is id, row>*/
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /*offsets*/
    tosizes[2*i+1] = toffsets[i]; /*offsets to send*/
  }
  /*send information to other processors*/
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /*build a star forest: one leaf per PetscInt we will receive*/
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /*message pair <no of is, row>; todata is packed via the per-rank offsets computed above*/
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /*deal with remote data: expand the requested rows into column lists to send back*/
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /*second two-sided exchange: reply sizes travel in the reverse direction*/
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*overlap communication and computation: expand local rows while replies are in flight*/
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /*merge the returned column lists into the index sets*/
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
201 
202 #undef __FUNCT__
203 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge the column lists received from
  the owning ranks into the local index sets.

  Input Parameters:
+ mat      - the parallel matrix (not referenced in this routine)
. nidx     - number of index sets
. is       - the index sets; each is destroyed and rebuilt with the merged, sorted,
             duplicate-free indices
. nrecvs   - total number of PetscInt entries in recvdata
- recvdata - received stream of triples <is id, number of rows, rows...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record the current size of each IS and the largest of them */
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* workspace: one row of (max_lsize+nrecvs) entries per IS - enough for the old
   * indices plus everything that could possibly be appended from recvdata */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /*retrieve information: walk the <is id, count, rows...> triples and append */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++]; /* which IS this chunk belongs to */
    data_size  = recvdata[i++]; /* number of column indices that follow */
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /*remove duplicate entities and rebuild each IS*/
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
249 
250 #undef __FUNCT__
251 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - expand the rows requested by other ranks
  into their column indices and pack the replies.

  Input Parameters:
+ mat       - the parallel matrix
. nidx      - number of index sets
. nfrom     - number of ranks that sent requests
. fromranks - the requesting ranks
. fromsizes - per requesting rank, pairs <message size, message offset>
- fromrows  - the received request stream of pairs <is id, row>

  Output Parameters:
+ sbrowsizes - per-rank pairs <reply size, reply offset>; allocated here, freed by caller
- sbrows     - reply stream of triples <is id, number of rows, rows...>; allocated
               here, freed by caller

  The work is done in two passes over the same request stream: the first pass only
  counts (to size the reply buffer exactly), the second pass packs the data.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric*/
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /*total number of nonzero values (diagonal block plus off-diagonal block)*/
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /*longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /*better way to estimate number of nonzero in the mat???*/
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count how many (deduplicated) columns each <rank, is> chunk produces */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /*group data by the IS it belongs to*/
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/*no of is*/
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /*estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/*do not count the rows from the original rank*/
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /*message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: re-walk the request stream and pack the reply buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets of each per-rank reply within sbdata */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each index set with the column
  indices of its locally owned rows; rows owned by other ranks are skipped here
  (the caller handles them through communication).
  Each is[i] is destroyed and rebuilt as a sorted, duplicate-free general IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat                   amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /*is it a safe way to compute number of nonzero values ?*/
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /*it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* only locally owned rows are expanded here */
      /*local number*/
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns shifted to global numbering */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: map through garray to global columns */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
440 
441 
442 /*
443   Sample message format:
444   If a processor A wants processor B to process some elements corresponding
445   to index sets is[1],is[5]
446   mesg [0] = 2   (no of index sets in the mesg)
447   -----------
448   mesg [1] = 1 => is[1]
449   mesg [2] = sizeof(is[1]);
450   -----------
451   mesg [3] = 5  => is[5]
452   mesg [4] = sizeof(is[5]);
453   -----------
454   mesg [5]
455   mesg [n]  datas[1]
456   -----------
457   mesg[n+1]
458   mesg[m]  data(is[5])
459   -----------
460 
461   Notes:
462   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
464 */
465 #undef __FUNCT__
466 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
  MatIncreaseOverlap_MPIAIJ_Once - increase the overlap of the index sets is[] by
  one layer, using the original MPI_Isend/MPI_Irecv based scheme (message format
  described in the comment above).  Each is[i] is destroyed and replaced by the
  enlarged set.  Note the per-process memory here scales with the global row count
  M (the bit tables and data arrays below); the _Scalable variant avoids that.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total rows going to proc p, w3[p] = number of ISs contributing to p */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table (size M bits) and one
     data array (size M) per IS */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the phase-1 messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the rows requested by other ranks; xdata[i]/isz1[i] is the reply for
     request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as outgoing messages,
         1 + 2*(no of IS) entries, then the data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (previously destroyed) index sets from the accumulated data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
737 
738 #undef __FUNCT__
739 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
740 /*
741    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
742        the work on the local processor.
743 
744      Inputs:
745       C      - MAT_MPIAIJ;
746       imax - total no of index sets processed at a time;
747       table  - an array of char - size = m bits.
748 
749      Output:
750       isz    - array containing the count of the solution elements corresponding
751                to each index set;
752       data   - pointer to the solutions
753 */
754 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
755 {
756   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
757   Mat        A  = c->A,B = c->B;
758   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
759   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
760   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
761   PetscBT    table_i;
762 
763   PetscFunctionBegin;
764   rstart = C->rmap->rstart;
765   cstart = C->cmap->rstart;
766   ai     = a->i;
767   aj     = a->j;
768   bi     = b->i;
769   bj     = b->j;
770   garray = c->garray;
771 
772 
773   for (i=0; i<imax; i++) {
774     data_i  = data[i];
775     table_i = table[i];
776     isz_i   = isz[i];
777     for (j=0,max=isz[i]; j<max; j++) {
778       row   = data_i[j] - rstart;
779       start = ai[row];
780       end   = ai[row+1];
781       for (k=start; k<end; k++) { /* Amat */
782         val = aj[k] + cstart;
783         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
784       }
785       start = bi[row];
786       end   = bi[row+1];
787       for (k=start; k<end; k++) { /* Bmat */
788         val = garray[bj[k]];
789         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
790       }
791     }
792     isz[i] = isz_i;
793   }
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
799 /*
800       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
801          and return the output
802 
803          Input:
804            C    - the matrix
805            nrqr - no of messages being processed.
806            rbuf - an array of pointers to the recieved requests
807 
808          Output:
809            xdata - array of messages to be sent back
810            isz1  - size of each message
811 
812   For better efficiency perhaps we should malloc separately each xdata[i],
813 then if a remalloc is required we need only copy the data for that one row
814 rather then all previous rows as it is now where a single large chunck of
815 memory is used.
816 
817 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* Count the total number of index sets (ct) and of requested row indices
     (total_sz) over all incoming request messages; each message is laid out
     as [nIS, (is_no,len) pairs, row indices...] */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the size of one contiguous reply buffer (shared by all nrqr
     replies) from the average nonzeros per local row; it is grown below
     whenever the estimate proves too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts total entries written into xdata[0]; ct1 reads the request,
     ct2 writes the reply, oct2 marks where the current IS's rows begin */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves, deduplicated via xtable */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* reply buffer full: grow by 1.5x and re-slice every earlier
               per-message pointer into the reallocated block */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the column indices of each row just entered (the overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block, shifted to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block, mapped via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
934 /* -------------------------------------------------------------------------*/
935 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
936 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
937 /*
938     Every processor gets the entire matrix
939 */
940 #undef __FUNCT__
941 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  /* Gathers the whole parallel matrix A onto every process as a sequential
     matrix *Bin[0]; with MAT_REUSE_MATRIX only the numerical values are
     regathered into the previously created structure */
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at this rank's row block, so the
       in-place allgatherv below fills the other ranks' portions around it */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: structure was built by a previous MAT_INITIAL_MATRIX call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    /* same in-place scheme as for the column indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path skipped the earlier allocation */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1124 
1125 
1126 
1127 #undef __FUNCT__
1128 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1129 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1130 {
1131   PetscErrorCode ierr;
1132   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1133   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1134 
1135   PetscFunctionBegin;
1136 
1137   /*
1138        Check for special case: each processor gets entire matrix
1139   */
1140   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1141     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1142     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1143     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1144     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1145     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1146       wantallmatrix = PETSC_TRUE;
1147 
1148       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1149     }
1150   }
1151   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1152   if (twantallmatrix) {
1153     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1154     PetscFunctionReturn(0);
1155   }
1156 
1157   /* Allocate memory to hold all the submatrices */
1158   if (scall != MAT_REUSE_MATRIX) {
1159     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1160   }
1161 
1162   /* Check for special case: each processor gets entire matrix columns */
1163   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1164   for (i=0; i<ismax; i++) {
1165     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1166     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1167     if (colflag && ncol == C->cmap->N) {
1168       allcolumns[i] = PETSC_TRUE;
1169     } else {
1170       allcolumns[i] = PETSC_FALSE;
1171     }
1172   }
1173 
1174   /* Determine the number of stages through which submatrices are done */
1175   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1176 
1177   /*
1178      Each stage will extract nmax submatrices.
1179      nmax is determined by the matrix column dimension.
1180      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1181   */
1182   if (!nmax) nmax = 1;
1183   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1184 
1185   /* Make sure every processor loops through the nstages */
1186   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1187 
1188   for (i=0,pos=0; i<nstages; i++) {
1189     if (pos+nmax <= ismax) max_no = nmax;
1190     else if (pos == ismax) max_no = 0;
1191     else                   max_no = ismax-pos;
1192     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1193     pos += max_no;
1194   }
1195 
1196   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 /* -------------------------------------------------------------------------*/
1201 #undef __FUNCT__
1202 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1203 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1204 {
1205   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1206   Mat            A  = c->A;
1207   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1208   const PetscInt **icol,**irow;
1209   PetscInt       *nrow,*ncol,start;
1210   PetscErrorCode ierr;
1211   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1212   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1213   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1214   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1215   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1216 #if defined(PETSC_USE_CTABLE)
1217   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1218 #else
1219   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1220 #endif
1221   const PetscInt *irow_i;
1222   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1223   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1224   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1225   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1226   MPI_Status     *r_status3,*r_status4,*s_status4;
1227   MPI_Comm       comm;
1228   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1229   PetscMPIInt    *onodes1,*olengths1;
1230   PetscMPIInt    idex,idex2,end;
1231 
1232   PetscFunctionBegin;
1233   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1234   tag0 = ((PetscObject)C)->tag;
1235   size = c->size;
1236   rank = c->rank;
1237 
1238   /* Get some new tags to keep the communication clean */
1239   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1240   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1241   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1242 
1243   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1244 
1245   for (i=0; i<ismax; i++) {
1246     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1247     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1248     if (allcolumns[i]) {
1249       icol[i] = NULL;
1250       ncol[i] = C->cmap->N;
1251     } else {
1252       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1253       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1254     }
1255   }
1256 
1257   /* evaluate communication - mesg to who, length of mesg, and buffer space
1258      required. Based on this, buffers are allocated, and data copied into them*/
1259   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1260   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1261   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1262   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1263   for (i=0; i<ismax; i++) {
1264     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1265     jmax   = nrow[i];
1266     irow_i = irow[i];
1267     for (j=0; j<jmax; j++) {
1268       l   = 0;
1269       row = irow_i[j];
1270       while (row >= C->rmap->range[l+1]) l++;
1271       proc = l;
1272       w4[proc]++;
1273     }
1274     for (j=0; j<size; j++) {
1275       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1276     }
1277   }
1278 
1279   nrqs     = 0;              /* no of outgoing messages */
1280   msz      = 0;              /* total mesg length (for all procs) */
1281   w1[rank] = 0;              /* no mesg sent to self */
1282   w3[rank] = 0;
1283   for (i=0; i<size; i++) {
1284     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1285   }
1286   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1287   for (i=0,j=0; i<size; i++) {
1288     if (w1[i]) { pa[j] = i; j++; }
1289   }
1290 
1291   /* Each message would have a header = 1 + 2*(no of IS) + data */
1292   for (i=0; i<nrqs; i++) {
1293     j      = pa[i];
1294     w1[j] += w2[j] + 2* w3[j];
1295     msz   += w1[j];
1296   }
1297   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1298 
1299   /* Determine the number of messages to expect, their lengths, from from-ids */
1300   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1301   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1302 
1303   /* Now post the Irecvs corresponding to these messages */
1304   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1305 
1306   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1307   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1308 
1309   /* Allocate Memory for outgoing messages */
1310   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1311   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1312   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1313 
1314   {
1315     PetscInt *iptr = tmp,ict = 0;
1316     for (i=0; i<nrqs; i++) {
1317       j        = pa[i];
1318       iptr    += ict;
1319       sbuf1[j] = iptr;
1320       ict      = w1[j];
1321     }
1322   }
1323 
1324   /* Form the outgoing messages */
1325   /* Initialize the header space */
1326   for (i=0; i<nrqs; i++) {
1327     j           = pa[i];
1328     sbuf1[j][0] = 0;
1329     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1330     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1331   }
1332 
1333   /* Parse the isrow and copy data into outbuf */
1334   for (i=0; i<ismax; i++) {
1335     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1336     irow_i = irow[i];
1337     jmax   = nrow[i];
1338     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1339       l   = 0;
1340       row = irow_i[j];
1341       while (row >= C->rmap->range[l+1]) l++;
1342       proc = l;
1343       if (proc != rank) { /* copy to the outgoing buf*/
1344         ctr[proc]++;
1345         *ptr[proc] = row;
1346         ptr[proc]++;
1347       }
1348     }
1349     /* Update the headers for the current IS */
1350     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1351       if ((ctr_j = ctr[j])) {
1352         sbuf1_j        = sbuf1[j];
1353         k              = ++sbuf1_j[0];
1354         sbuf1_j[2*k]   = ctr_j;
1355         sbuf1_j[2*k-1] = i;
1356       }
1357     }
1358   }
1359 
1360   /*  Now  post the sends */
1361   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1362   for (i=0; i<nrqs; ++i) {
1363     j    = pa[i];
1364     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1365   }
1366 
1367   /* Post Receives to capture the buffer size */
1368   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1369   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1370   rbuf2[0] = tmp + msz;
1371   for (i=1; i<nrqs; ++i) {
1372     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1373   }
1374   for (i=0; i<nrqs; ++i) {
1375     j    = pa[i];
1376     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1377   }
1378 
1379   /* Send to other procs the buf size they should allocate */
1380 
1381 
1382   /* Receive messages*/
1383   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1384   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1385   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1386   {
1387     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1388     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1389     PetscInt   *sbuf2_i;
1390 
1391     for (i=0; i<nrqr; ++i) {
1392       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1393 
1394       req_size[idex] = 0;
1395       rbuf1_i        = rbuf1[idex];
1396       start          = 2*rbuf1_i[0] + 1;
1397       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1398       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1399       sbuf2_i        = sbuf2[idex];
1400       for (j=start; j<end; j++) {
1401         id              = rbuf1_i[j] - rstart;
1402         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1403         sbuf2_i[j]      = ncols;
1404         req_size[idex] += ncols;
1405       }
1406       req_source[idex] = r_status1[i].MPI_SOURCE;
1407       /* form the header */
1408       sbuf2_i[0] = req_size[idex];
1409       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1410 
1411       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1412     }
1413   }
1414   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1415   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1416 
1417   /*  recv buffer sizes */
1418   /* Receive messages*/
1419 
1420   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1421   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1422   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1423   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1424   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1425 
1426   for (i=0; i<nrqs; ++i) {
1427     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1428     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1429     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1430     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1431     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1432   }
1433   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1434   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1435 
1436   /* Wait on sends1 and sends2 */
1437   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1438   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1439 
1440   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1441   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1442   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1443   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1444   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1445   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1446 
1447   /* Now allocate buffers for a->j, and send them off */
1448   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1449   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1450   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1451   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1452 
1453   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1454   {
1455     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1456     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1457     PetscInt cend = C->cmap->rend;
1458     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1459 
1460     for (i=0; i<nrqr; i++) {
1461       rbuf1_i   = rbuf1[i];
1462       sbuf_aj_i = sbuf_aj[i];
1463       ct1       = 2*rbuf1_i[0] + 1;
1464       ct2       = 0;
1465       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1466         kmax = rbuf1[i][2*j];
1467         for (k=0; k<kmax; k++,ct1++) {
1468           row    = rbuf1_i[ct1] - rstart;
1469           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1470           ncols  = nzA + nzB;
1471           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1472 
1473           /* load the column indices for this row into cols*/
1474           cols = sbuf_aj_i + ct2;
1475 
1476           lwrite = 0;
1477           for (l=0; l<nzB; l++) {
1478             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1479           }
1480           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1481           for (l=0; l<nzB; l++) {
1482             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1483           }
1484 
1485           ct2 += ncols;
1486         }
1487       }
1488       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1489     }
1490   }
1491   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1492   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1493 
1494   /* Allocate buffers for a->a, and send them off */
1495   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1496   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1497   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1498   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1499 
1500   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1501   {
1502     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1503     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1504     PetscInt    cend   = C->cmap->rend;
1505     PetscInt    *b_j   = b->j;
1506     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1507 
1508     for (i=0; i<nrqr; i++) {
1509       rbuf1_i   = rbuf1[i];
1510       sbuf_aa_i = sbuf_aa[i];
1511       ct1       = 2*rbuf1_i[0]+1;
1512       ct2       = 0;
1513       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1514         kmax = rbuf1_i[2*j];
1515         for (k=0; k<kmax; k++,ct1++) {
1516           row    = rbuf1_i[ct1] - rstart;
1517           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1518           ncols  = nzA + nzB;
1519           cworkB = b_j + b_i[row];
1520           vworkA = a_a + a_i[row];
1521           vworkB = b_a + b_i[row];
1522 
1523           /* load the column values for this row into vals*/
1524           vals = sbuf_aa_i+ct2;
1525 
1526           lwrite = 0;
1527           for (l=0; l<nzB; l++) {
1528             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1529           }
1530           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1531           for (l=0; l<nzB; l++) {
1532             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1533           }
1534 
1535           ct2 += ncols;
1536         }
1537       }
1538       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1539     }
1540   }
1541   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1542   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1543   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1544   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1545 
1546   /* Form the matrix */
1547   /* create col map: global col of C -> local col of submatrices */
1548   {
1549     const PetscInt *icol_i;
1550 #if defined(PETSC_USE_CTABLE)
1551     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1552     for (i=0; i<ismax; i++) {
1553       if (!allcolumns[i]) {
1554         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1555 
1556         jmax   = ncol[i];
1557         icol_i = icol[i];
1558         cmap_i = cmap[i];
1559         for (j=0; j<jmax; j++) {
1560           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1561         }
1562       } else {
1563         cmap[i] = NULL;
1564       }
1565     }
1566 #else
1567     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1568     for (i=0; i<ismax; i++) {
1569       if (!allcolumns[i]) {
1570         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1571         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1572         jmax   = ncol[i];
1573         icol_i = icol[i];
1574         cmap_i = cmap[i];
1575         for (j=0; j<jmax; j++) {
1576           cmap_i[icol_i[j]] = j+1;
1577         }
1578       } else {
1579         cmap[i] = NULL;
1580       }
1581     }
1582 #endif
1583   }
1584 
1585   /* Create lens which is required for MatCreate... */
1586   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1587   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1588   if (ismax) {
1589     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1590     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1591   }
1592   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1593 
1594   /* Update lens from local data */
1595   for (i=0; i<ismax; i++) {
1596     jmax = nrow[i];
1597     if (!allcolumns[i]) cmap_i = cmap[i];
1598     irow_i = irow[i];
1599     lens_i = lens[i];
1600     for (j=0; j<jmax; j++) {
1601       l   = 0;
1602       row = irow_i[j];
1603       while (row >= C->rmap->range[l+1]) l++;
1604       proc = l;
1605       if (proc == rank) {
1606         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1607         if (!allcolumns[i]) {
1608           for (k=0; k<ncols; k++) {
1609 #if defined(PETSC_USE_CTABLE)
1610             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1611 #else
1612             tcol = cmap_i[cols[k]];
1613 #endif
1614             if (tcol) lens_i[j]++;
1615           }
1616         } else { /* allcolumns */
1617           lens_i[j] = ncols;
1618         }
1619         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1620       }
1621     }
1622   }
1623 
1624   /* Create row map: global row of C -> local row of submatrices */
1625 #if defined(PETSC_USE_CTABLE)
1626   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1627   for (i=0; i<ismax; i++) {
1628     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1629     rmap_i = rmap[i];
1630     irow_i = irow[i];
1631     jmax   = nrow[i];
1632     for (j=0; j<jmax; j++) {
1633       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1634     }
1635   }
1636 #else
1637   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1638   if (ismax) {
1639     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1640     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1641   }
1642   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1643   for (i=0; i<ismax; i++) {
1644     rmap_i = rmap[i];
1645     irow_i = irow[i];
1646     jmax   = nrow[i];
1647     for (j=0; j<jmax; j++) {
1648       rmap_i[irow_i[j]] = j;
1649     }
1650   }
1651 #endif
1652 
1653   /* Update lens from offproc data */
1654   {
1655     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1656 
1657     for (tmp2=0; tmp2<nrqs; tmp2++) {
1658       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1659       idex    = pa[idex2];
1660       sbuf1_i = sbuf1[idex];
1661       jmax    = sbuf1_i[0];
1662       ct1     = 2*jmax+1;
1663       ct2     = 0;
1664       rbuf2_i = rbuf2[idex2];
1665       rbuf3_i = rbuf3[idex2];
1666       for (j=1; j<=jmax; j++) {
1667         is_no  = sbuf1_i[2*j-1];
1668         max1   = sbuf1_i[2*j];
1669         lens_i = lens[is_no];
1670         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1671         rmap_i = rmap[is_no];
1672         for (k=0; k<max1; k++,ct1++) {
1673 #if defined(PETSC_USE_CTABLE)
1674           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1675           row--;
1676           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1677 #else
1678           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1679 #endif
1680           max2 = rbuf2_i[ct1];
1681           for (l=0; l<max2; l++,ct2++) {
1682             if (!allcolumns[is_no]) {
1683 #if defined(PETSC_USE_CTABLE)
1684               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1685 #else
1686               tcol = cmap_i[rbuf3_i[ct2]];
1687 #endif
1688               if (tcol) lens_i[row]++;
1689             } else { /* allcolumns */
1690               lens_i[row]++; /* lens_i[row] += max2 ? */
1691             }
1692           }
1693         }
1694       }
1695     }
1696   }
1697   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1698   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1699   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1700   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1701   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1702 
1703   /* Create the submatrices */
1704   if (scall == MAT_REUSE_MATRIX) {
1705     PetscBool flag;
1706 
1707     /*
1708         Assumes new rows are same length as the old rows,hence bug!
1709     */
1710     for (i=0; i<ismax; i++) {
1711       mat = (Mat_SeqAIJ*)(submats[i]->data);
1712       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1713       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1714       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1715       /* Initial matrix as if empty */
1716       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1717 
1718       submats[i]->factortype = C->factortype;
1719     }
1720   } else {
1721     for (i=0; i<ismax; i++) {
1722       PetscInt rbs,cbs;
1723       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1724       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1725 
1726       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1727       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1728 
1729       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1730       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1731       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1732     }
1733   }
1734 
1735   /* Assemble the matrices */
1736   /* First assemble the local rows */
1737   {
1738     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1739     PetscScalar *imat_a;
1740 
1741     for (i=0; i<ismax; i++) {
1742       mat       = (Mat_SeqAIJ*)submats[i]->data;
1743       imat_ilen = mat->ilen;
1744       imat_j    = mat->j;
1745       imat_i    = mat->i;
1746       imat_a    = mat->a;
1747 
1748       if (!allcolumns[i]) cmap_i = cmap[i];
1749       rmap_i = rmap[i];
1750       irow_i = irow[i];
1751       jmax   = nrow[i];
1752       for (j=0; j<jmax; j++) {
1753         l   = 0;
1754         row = irow_i[j];
1755         while (row >= C->rmap->range[l+1]) l++;
1756         proc = l;
1757         if (proc == rank) {
1758           old_row = row;
1759 #if defined(PETSC_USE_CTABLE)
1760           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1761           row--;
1762 #else
1763           row = rmap_i[row];
1764 #endif
1765           ilen_row = imat_ilen[row];
1766           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1767           mat_i    = imat_i[row];
1768           mat_a    = imat_a + mat_i;
1769           mat_j    = imat_j + mat_i;
1770           if (!allcolumns[i]) {
1771             for (k=0; k<ncols; k++) {
1772 #if defined(PETSC_USE_CTABLE)
1773               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1774 #else
1775               tcol = cmap_i[cols[k]];
1776 #endif
1777               if (tcol) {
1778                 *mat_j++ = tcol - 1;
1779                 *mat_a++ = vals[k];
1780                 ilen_row++;
1781               }
1782             }
1783           } else { /* allcolumns */
1784             for (k=0; k<ncols; k++) {
1785               *mat_j++ = cols[k];  /* global col index! */
1786               *mat_a++ = vals[k];
1787               ilen_row++;
1788             }
1789           }
1790           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1791 
1792           imat_ilen[row] = ilen_row;
1793         }
1794       }
1795     }
1796   }
1797 
1798   /*   Now assemble the off proc rows*/
1799   {
1800     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1801     PetscInt    *imat_j,*imat_i;
1802     PetscScalar *imat_a,*rbuf4_i;
1803 
1804     for (tmp2=0; tmp2<nrqs; tmp2++) {
1805       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1806       idex    = pa[idex2];
1807       sbuf1_i = sbuf1[idex];
1808       jmax    = sbuf1_i[0];
1809       ct1     = 2*jmax + 1;
1810       ct2     = 0;
1811       rbuf2_i = rbuf2[idex2];
1812       rbuf3_i = rbuf3[idex2];
1813       rbuf4_i = rbuf4[idex2];
1814       for (j=1; j<=jmax; j++) {
1815         is_no     = sbuf1_i[2*j-1];
1816         rmap_i    = rmap[is_no];
1817         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1818         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1819         imat_ilen = mat->ilen;
1820         imat_j    = mat->j;
1821         imat_i    = mat->i;
1822         imat_a    = mat->a;
1823         max1      = sbuf1_i[2*j];
1824         for (k=0; k<max1; k++,ct1++) {
1825           row = sbuf1_i[ct1];
1826 #if defined(PETSC_USE_CTABLE)
1827           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1828           row--;
1829 #else
1830           row = rmap_i[row];
1831 #endif
1832           ilen  = imat_ilen[row];
1833           mat_i = imat_i[row];
1834           mat_a = imat_a + mat_i;
1835           mat_j = imat_j + mat_i;
1836           max2  = rbuf2_i[ct1];
1837           if (!allcolumns[is_no]) {
1838             for (l=0; l<max2; l++,ct2++) {
1839 
1840 #if defined(PETSC_USE_CTABLE)
1841               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1842 #else
1843               tcol = cmap_i[rbuf3_i[ct2]];
1844 #endif
1845               if (tcol) {
1846                 *mat_j++ = tcol - 1;
1847                 *mat_a++ = rbuf4_i[ct2];
1848                 ilen++;
1849               }
1850             }
1851           } else { /* allcolumns */
1852             for (l=0; l<max2; l++,ct2++) {
1853               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1854               *mat_a++ = rbuf4_i[ct2];
1855               ilen++;
1856             }
1857           }
1858           imat_ilen[row] = ilen;
1859         }
1860       }
1861     }
1862   }
1863 
1864   /* sort the rows */
1865   {
1866     PetscInt    *imat_ilen,*imat_j,*imat_i;
1867     PetscScalar *imat_a;
1868 
1869     for (i=0; i<ismax; i++) {
1870       mat       = (Mat_SeqAIJ*)submats[i]->data;
1871       imat_j    = mat->j;
1872       imat_i    = mat->i;
1873       imat_a    = mat->a;
1874       imat_ilen = mat->ilen;
1875 
1876       if (allcolumns[i]) continue;
1877       jmax = nrow[i];
1878       for (j=0; j<jmax; j++) {
1879         PetscInt ilen;
1880 
1881         mat_i = imat_i[j];
1882         mat_a = imat_a + mat_i;
1883         mat_j = imat_j + mat_i;
1884         ilen  = imat_ilen[j];
1885         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1886       }
1887     }
1888   }
1889 
1890   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1891   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1892   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1893   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1894   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1895 
1896   /* Restore the indices */
1897   for (i=0; i<ismax; i++) {
1898     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1899     if (!allcolumns[i]) {
1900       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1901     }
1902   }
1903 
1904   /* Destroy allocated memory */
1905   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1906   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1907   ierr = PetscFree(pa);CHKERRQ(ierr);
1908 
1909   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1910   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1911   for (i=0; i<nrqr; ++i) {
1912     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1913   }
1914   for (i=0; i<nrqs; ++i) {
1915     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1916     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1917   }
1918 
1919   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1920   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1921   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1922   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1923   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1924   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1925   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1926 
1927 #if defined(PETSC_USE_CTABLE)
1928   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1929 #else
1930   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1931 #endif
1932   ierr = PetscFree(rmap);CHKERRQ(ierr);
1933 
1934   for (i=0; i<ismax; i++) {
1935     if (!allcolumns[i]) {
1936 #if defined(PETSC_USE_CTABLE)
1937       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1938 #else
1939       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1940 #endif
1941     }
1942   }
1943   ierr = PetscFree(cmap);CHKERRQ(ierr);
1944   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1945   ierr = PetscFree(lens);CHKERRQ(ierr);
1946 
1947   for (i=0; i<ismax; i++) {
1948     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1949     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1950   }
1951   PetscFunctionReturn(0);
1952 }
1953 
1954 /*
1955  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1956  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1958  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1959  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1960  state, and needs to be "assembled" later by compressing B's column space.
1961 
1962  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1963  Following this call, C->A & C->B have been created, even if empty.
1964  */
1965 #undef __FUNCT__
1966 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
1967 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1968 {
1969   /* If making this function public, change the error returned in this function away from _PLIB. */
1970   PetscErrorCode ierr;
1971   Mat_MPIAIJ     *aij;
1972   Mat_SeqAIJ     *Baij;
1973   PetscBool      seqaij,Bdisassembled;
1974   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1975   PetscScalar    v;
1976   const PetscInt *rowindices,*colindices;
1977 
1978   PetscFunctionBegin;
1979   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1980   if (A) {
1981     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1982     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1983     if (rowemb) {
1984       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1985       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1986     } else {
1987       if (C->rmap->n != A->rmap->n) {
1988 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1989       }
1990     }
1991     if (dcolemb) {
1992       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
1993       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1994     } else {
1995       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1996     }
1997   }
1998   if (B) {
1999     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2000     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2001     if (rowemb) {
2002       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2003       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2004     } else {
2005       if (C->rmap->n != B->rmap->n) {
2006 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2007       }
2008     }
2009     if (ocolemb) {
2010       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2011       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2012     } else {
2013       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2014     }
2015   }
2016 
2017   aij    = (Mat_MPIAIJ*)(C->data);
2018   if (!aij->A) {
2019     /* Mimic parts of MatMPIAIJSetPreallocation() */
2020     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2021     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2022     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2023     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2024     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2025   }
2026   if (A) {
2027     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2028   } else {
2029     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2030   }
2031   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2032     /*
2033       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2034       need to "disassemble" B -- convert it to using C's global indices.
2035       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2036 
2037       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2038       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2039 
2040       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2041       At least avoid calling MatSetValues() and the implied searches?
2042     */
2043 
2044     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2045 #if defined(PETSC_USE_CTABLE)
2046       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2047 #else
2048       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2049       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2050       if (aij->B) {
2051         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2052       }
2053 #endif
2054       ngcol = 0;
2055       if (aij->lvec) {
2056 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2057       }
2058       if (aij->garray) {
2059 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2060 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2061       }
2062       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2063       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2064     }
2065     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2066       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2067     }
2068     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2069       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2070     }
2071   }
2072   Bdisassembled = PETSC_FALSE;
2073   if (!aij->B) {
2074     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2075     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2076     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2077     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2078     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2079     Bdisassembled = PETSC_TRUE;
2080   }
2081   if (B) {
2082     Baij = (Mat_SeqAIJ*)(B->data);
2083     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2084       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2085       for (i=0; i<B->rmap->n; i++) {
2086 	nz[i] = Baij->i[i+1] - Baij->i[i];
2087       }
2088       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2089       ierr = PetscFree(nz);CHKERRQ(ierr);
2090     }
2091 
2092     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2093     shift = rend-rstart;
2094     count = 0;
2095     rowindices = NULL;
2096     colindices = NULL;
2097     if (rowemb) {
2098       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2099     }
2100     if (ocolemb) {
2101       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2102     }
2103     for (i=0; i<B->rmap->n; i++) {
2104       PetscInt row;
2105       row = i;
2106       if (rowindices) row = rowindices[i];
2107       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2108 	col  = Baij->j[count];
2109 	if (colindices) col = colindices[col];
2110 	if (Bdisassembled && col>=rstart) col += shift;
2111 	v    = Baij->a[count];
2112 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
2113 	++count;
2114       }
2115     }
2116     /* No assembly for aij->B is necessary. */
2117     /* FIXME: set aij->B's nonzerostate correctly. */
2118   } else {
2119     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
2120   }
2121   C->preallocated  = PETSC_TRUE;
2122   C->was_assembled = PETSC_FALSE;
2123   C->assembled     = PETSC_FALSE;
2124    /*
2125       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2126       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2127    */
2128   PetscFunctionReturn(0);
2129 }
2130 
2131 #undef __FUNCT__
2132 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
2133 /*
2134   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
2135  */
2136 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2137 {
2138   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2139 
2140   PetscFunctionBegin;
2141   PetscValidPointer(A,2);
2142   PetscValidPointer(B,3);
2143   /* FIXME: make sure C is assembled */
2144   *A = aij->A;
2145   *B = aij->B;
2146   /* Note that we don't incref *A and *B, so be careful! */
2147   PetscFunctionReturn(0);
2148 }
2149 
2150 /*
2151   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2152   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2153 */
2154 #undef __FUNCT__
2155 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
2156 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2157                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2158 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2159 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2160 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2161 {
2162   PetscErrorCode ierr;
2163   PetscMPIInt    isize,flag;
2164   PetscInt       i,ii,cismax,ispar;
2165   Mat            *A,*B;
2166   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
2167 
2168   PetscFunctionBegin;
2169   if (!ismax) PetscFunctionReturn(0);
2170 
2171   for (i = 0, cismax = 0; i < ismax; ++i) {
2172     PetscMPIInt isize;
2173     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
2174     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2175     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
2176     if (isize > 1) ++cismax;
2177   }
2178   /*
2179      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2180      ispar counts the number of parallel ISs across C's comm.
2181   */
2182   ierr = MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2183   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2184     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2185     PetscFunctionReturn(0);
2186   }
2187 
2188   /* if (ispar) */
2189   /*
2190     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2191     These are used to extract the off-diag portion of the resulting parallel matrix.
2192     The row IS for the off-diag portion is the same as for the diag portion,
2193     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2194   */
2195   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
2196   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
2197   for (i = 0, ii = 0; i < ismax; ++i) {
2198     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2199     if (isize > 1) {
2200       /*
2201 	 TODO: This is the part that's ***NOT SCALABLE***.
2202 	 To fix this we need to extract just the indices of C's nonzero columns
2203 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2204 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
2205 	 to be done without serializing on the IS list, so, most likely, it is best
2206 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2207       */
2208       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
2209       /* Now we have to
2210 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2211 	     were sorted on each rank, concatenated they might no longer be sorted;
2212 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2213 	     indices in the nondecreasing order to the original index positions.
2214 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2215       */
2216       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
2217       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
2218       ++ii;
2219     }
2220   }
2221   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
2222   for (i = 0, ii = 0; i < ismax; ++i) {
2223     PetscInt       j,issize;
2224     const PetscInt *indices;
2225 
2226     /*
2227        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2228      */
2229     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
2230     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
2231     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
2232     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
2233     for (j = 1; j < issize; ++j) {
2234       if (indices[j] == indices[j-1]) {
2235 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2236       }
2237     }
2238     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
2239 
2240 
2241     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
2242     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
2243     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
2244     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
2245     for (j = 1; j < issize; ++j) {
2246       if (indices[j-1] == indices[j]) {
2247 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2248       }
2249     }
2250     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
2251     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2252     if (isize > 1) {
2253       cisrow[ii] = isrow[i];
2254       ++ii;
2255     }
2256   }
2257   /*
2258     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2259     array of sequential matrices underlying the resulting parallel matrices.
2260     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2261     contain duplicates.
2262 
2263     There are as many diag matrices as there are original index sets. There are only as many parallel
2264     and off-diag matrices, as there are parallel (comm size > 1) index sets.
2265 
2266     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2267     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2268       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2269       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2270       with A[i] and B[ii] extracted from the corresponding MPI submat.
2271     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2272       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
2273       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2274       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
2275       values into A[i] and B[ii] sitting inside the corresponding submat.
2276     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2277       A[i], B[ii], AA[i] or BB[ii] matrices.
2278   */
2279   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2280   if (scall == MAT_INITIAL_MATRIX) {
2281     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2282     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2283   } else {
2284     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
2285     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
2286     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2287     for (i = 0, ii = 0; i < ismax; ++i) {
2288       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2289       if (isize > 1) {
2290 	Mat AA,BB;
2291 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
2292 	if (!isrow_p[i] && !iscol_p[i]) {
2293 	  A[i] = AA;
2294 	} else {
2295 	  /* TODO: extract A[i] composed on AA. */
2296 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2297 	}
2298 	if (!isrow_p[i] && !ciscol_p[ii]) {
2299 	  B[ii] = BB;
2300 	} else {
2301 	  /* TODO: extract B[ii] composed on BB. */
2302 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
2303 	}
2304 	++ii;
2305       } else {
2306 	if (!isrow_p[i] && !iscol_p[i]) {
2307 	  A[i] = (*submat)[i];
2308 	} else {
2309 	  /* TODO: extract A[i] composed on (*submat)[i]. */
2310 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2311 	}
2312       }
2313     }
2314   }
2315   /* Now obtain the sequential A and B submatrices separately. */
2316   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
2317   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
2318   /*
2319     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2320     matrices A & B have been extracted directly into the parallel matrices containing them, or
2321     simply into the sequential matrix identical with the corresponding A (if isize == 1).
2322     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2323     to have the same sparsity pattern.
2324     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2325     must be constructed for C. This is done by setseqmat(s).
2326   */
2327   for (i = 0, ii = 0; i < ismax; ++i) {
2328     /*
2329        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2330        That way we can avoid sorting and computing permutations when reusing.
2331        To this end:
2332         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2333 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
2334 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2335     */
2336     MatStructure pattern;
2337     if (scall == MAT_INITIAL_MATRIX) {
2338       pattern = DIFFERENT_NONZERO_PATTERN;
2339     } else {
2340       pattern = SAME_NONZERO_PATTERN;
2341     }
2342     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2343     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2344     if (isize > 1) {
2345       if (scall == MAT_INITIAL_MATRIX) {
2346 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
2347 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2348 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
2349 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
2350 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
2351       }
2352       /*
2353 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2354       */
2355       {
2356 	Mat AA,BB;
2357 	AA = NULL;
2358 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2359 	BB = NULL;
2360 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2361 	if (AA || BB) {
2362 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
2363 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2364 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2365 	}
2366 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2367 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2368 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
2369 	}
2370 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2371 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
2372 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
2373 	}
2374       }
2375       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
2376       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
2377       ++ii;
2378     } else { /* if (isize == 1) */
2379       if (scall == MAT_INITIAL_MATRIX) {
2380 	if (isrow_p[i] || iscol_p[i]) {
2381 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
2382 	} else (*submat)[i] = A[i];
2383       }
2384       if (isrow_p[i] || iscol_p[i]) {
2385 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
2386 	/* Otherwise A is extracted straight into (*submats)[i]. */
2387 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2388 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
2389       }
2390     }
2391     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
2392     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
2393   }
2394   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
2395   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
2396   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
2397   ierr = PetscFree(A);CHKERRQ(ierr);
2398   ierr = PetscFree(B);CHKERRQ(ierr);
2399   PetscFunctionReturn(0);
2400 }
2401 
2402 
2403 
2404 #undef __FUNCT__
2405 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
2406 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2407 {
2408   PetscErrorCode ierr;
2409 
2410   PetscFunctionBegin;
2411   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
2412   PetscFunctionReturn(0);
2413 }
2414