xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision bc99ddf557d174fb7f56c45fecd85e966ac78361)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Increases the overlap of the nidx
   index sets in is[] by exactly one level, using a memory-scalable scheme
   built on PetscCommBuildTwoSided() and PetscSF instead of all-to-all style
   buffers.

   Outline:
     1. Scan every IS and collect the rows owned by other ranks
        (remoterows / rrow_ranks / rrow_isids).
     2. If no rank has remote rows, only local work is needed.
     3. Otherwise, send each remote row (as a pair <is id, row>) to its owner,
        receive back the owner-computed overlap, do the local overlap
        concurrently, and finally merge the received data into is[].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record which rank owns the row, which IS it
         came from, and the row itself */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  /* tosizes holds pairs: tosizes[2*i] = message length to toranks[i],
     tosizes[2*i+1] = offset of that message (filled below) */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: 2 entries (<is id, row>) per remote row */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* build SF leaves: one entry per incoming PetscInt, rooted at the sender's
     buffer position fromsizes[2*i+1]+j */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /* pack outgoing pairs; toffsets[j] is advanced in place as the cursor for
     destination rank j's segment */
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second two-sided exchange: the former senders (fromranks) now send back
     the computed overlap rows */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: do the local overlap while the
     remote overlap data is in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received overlap rows into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges overlap rows received
   from remote ranks into the local index sets.

   recvdata is a packed sequence of triples <is id, number of rows, rows...>,
   nrecvs PetscInt entries in total.  Each is[i] is destroyed and recreated
   as the sorted, duplicate-free union of its original entries and the
   received rows.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* isz[i] tracks the current fill of index set i's slab */
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of max_lsize+nrecvs entries per IS: any single IS can receive
     at most nrecvs new entries, so a slab cannot overflow */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap; i is advanced inside
     the loop as the cursor through the packed triples */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and rebuild each IS from its slab */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each requesting rank, computes
   the overlap (column indices of locally owned rows it asked about) and packs
   it for sending back.

   fromrows contains pairs <is id, row> grouped by requesting rank i with
   lengths fromsizes[2*i].  On output, *sbrows holds packed triples
   <is id, count, columns...> per rank, and *sbrowsizes holds per-rank
   pairs <length, offset>.  Caller frees both with PetscFree2(*sbrows,*sbrowsizes)
   (they are allocated together via PetscCalloc2).

   The work is done in two identical passes over fromrows: the first pass only
   counts unique columns per <rank, is> (indv_counts) to size the send buffer,
   the second pass fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* no symmetry is exploited: the row structure is traversed explicitly even
     if the matrix happens to be symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  /* rows_data[j] is a per-IS scratch list of requested rows; indices_tmp
     collects columns before sort/dedup (tnz is an upper bound per group) */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count unique columns per <rank, is id> */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat: local columns shifted to global */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat: map through the global column array */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal again, this time filling sbdata */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue; /* nothing for this <rank, is> pair */
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank lengths into <length, offset> pairs */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Performs the purely local part of
   a one-level overlap increase: for every locally owned row in each is[i],
   adds the global column indices of that row (from both the diagonal Amat and
   off-diagonal Bmat parts).  Rows owned by other ranks are skipped here; they
   are handled by the send/receive path.

   Each is[i] is destroyed and recreated with the sorted, duplicate-free
   result.  Note the result contains only the column closure of the locally
   owned rows, not the original remote entries (those are re-merged by the
   caller's receive step).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  /* tnz = total local nonzeros; an upper bound on the columns gathered for
     any single IS, so one scratch buffer of size tnz suffices */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0; /* scratch buffer is reused for each IS */
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows are handled elsewhere */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns shifted to global */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: map through the global column array */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the message)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5] ... mesg [n]   = data of is[1]
  -----------
  mesg [n+1] ... mesg [m] = data of is[5]
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increases the overlap of the imax index
   sets in is[] by exactly one level using explicit MPI_Isend/MPI_Irecv
   messaging (the non-scalable variant: per-rank work arrays of length size).

   Protocol: round 1 sends each remote row request to its owner (message
   format documented above); owners compute the overlap locally and round 2
   sends the results back.  Local rows are processed concurrently via
   MatIncreaseOverlap_MPIAIJ_Local.  With PETSC_USE_CTABLE the per-IS
   membership is kept in hash tables instead of dense arrays of length M.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = #indices to send to rank p, w3[p] = #index sets contributing,
     w4 is per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no message sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination segments */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; /* per-IS bit table over all M rows */
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) { /* first time this row is seen locally */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0]; /* one more IS in this message */
          outdat_j[2*k]   = ctr_j;         /* count for this IS */
          outdat_j[2*k-1] = i;             /* IS id */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* compute the overlap for the rows other ranks requested; xdata[i]/isz1[i]
     is the reply for request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same <count, is id> header layout as phase 1 */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* cursor past the header into the data */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* add only rows not seen yet */
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array big enough for the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated rows */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    /* table maps row+1 -> position+1; undo both +1 shifts while flattening */
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
797 /*
798    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
799        the work on the local processor.
800 
801      Inputs:
802       C      - MAT_MPIAIJ;
803       imax - total no of index sets processed at a time;
804       table  - an array of char - size = m bits.
805 
806      Output:
807       isz    - array containing the count of the solution elements corresponding
808                to each index set;
809       data or table_data  - pointer to the solutions
810 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B; /* A: diagonal block, B: off-diagonal block of the locally owned rows */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this process */
  cstart = C->cmap->rstart; /* global column offset of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps compressed off-diagonal column indices to global columns */

  for (i=0; i<imax; i++) { /* enlarge each of the imax index sets in turn */
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      /* the table stores (row+1 -> position+1); undo both +1 shifts while flattening */
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i]; /* bit table marking indices already present in set i */
    isz_i   = isz[i];   /* running size of set i; grows as new columns are appended */
    max     = isz[i];   /* only sweep the rows known before this call */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart; /* convert global row to local row index */
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart; /* global column of this diagonal-block entry */
        if (!PetscBTLookupSet(table_i,val)) { /* first occurrence: append to set i */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]]; /* global column of this off-diagonal-block entry */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* publish the enlarged count for set i */

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
899 
900 /*
901       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
902          and return the output
903 
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
907            rbuf - an array of pointers to the recieved requests
908 
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912 
913   For better efficiency perhaps we should malloc separately each xdata[i],
914 then if a remalloc is required we need only copy the data for that one row
915 rather then all previous rows as it is now where a single large chunck of
916 memory is used.
917 
918 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B; /* diagonal and off-diagonal blocks of the local rows */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N; /* global number of rows; sizes the dedup bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* compressed off-diagonal column index -> global column */


  /* count the total requested index sets (ct) and incoming indices (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0]; /* header word: number of index sets in this message */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size guess: requested sets times average nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all replies share one contiguous buffer anchored at xdata[0] */
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0; /* total entries written into the shared buffer so far */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1; /* read cursor: first index after the incoming header */
    ct2    =  ct1;        /* write cursor: first slot after the outgoing header */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); /* fresh dedup table per IS */
      oct2 = ct2; /* remember where this IS's output begins */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase all xdata[] pointers into it */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly stored row by its column indices (one level of overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal-block columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal-block columns, mapped to global */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; /* ad: diagonal block, bd: off-diagonal block */
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = nonzeros in diagonal block + nonzeros in off-diagonal block */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
       (written directly into B's j array at the offset of this rank's rows,
        so the in-place allgather below fills in the remote parts around it)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      /* recvcounts[i] = total nonzeros owned by process i */
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: B was created by a previous MAT_INITIAL_MATRIX call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* write values in place, mirroring the j layout above */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the counts were freed after the initial call */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1223 
1224 PetscErrorCode MatDestroy_Dummy_MatGetSubmatrices(Mat C)
1225 {
1226   PetscErrorCode ierr;
1227   Mat_SubMat     *submatj = (Mat_SubMat*)C->data;
1228   PetscInt       i;
1229 
1230   PetscFunctionBegin;
1231   if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
1232     ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1233 
1234     for (i=0; i<submatj->nrqr; ++i) {
1235       ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1236     }
1237     ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1238 
1239     if (submatj->rbuf1) {
1240       ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1241       ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1242     }
1243 
1244     for (i=0; i<submatj->nrqs; ++i) {
1245       ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1246     }
1247     ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1248     ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1249   }
1250 
1251 #if defined(PETSC_USE_CTABLE)
1252   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1253   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1254   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1255 #else
1256   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1257 #endif
1258 
1259   if (!submatj->allcolumns) {
1260 #if defined(PETSC_USE_CTABLE)
1261     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1262 #else
1263     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1264 #endif
1265   }
1266   ierr = submatj->destroy(C);CHKERRQ(ierr);
1267   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1268 
1269   ierr = PetscFree(submatj);CHKERRQ(ierr);
1270   PetscFunctionReturn(0);
1271 }
1272 
1273 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1274 {
1275   PetscErrorCode ierr;
1276   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1277   Mat_SubMat     *submatj = c->submatis1;
1278   PetscInt       i;
1279 
1280   PetscFunctionBegin;
1281   if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
1282     ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1283 
1284     for (i=0; i<submatj->nrqr; ++i) {
1285       ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1286     }
1287     ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1288 
1289     if (submatj->rbuf1) {
1290       ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1291       ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1292     }
1293 
1294     for (i=0; i<submatj->nrqs; ++i) {
1295       ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1296     }
1297     ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1298     ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1299   }
1300 
1301 #if defined(PETSC_USE_CTABLE)
1302   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1303   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1304   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1305 #else
1306   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1307 #endif
1308 
1309   if (!submatj->allcolumns) {
1310 #if defined(PETSC_USE_CTABLE)
1311     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1312 #else
1313     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1314 #endif
1315   }
1316   ierr = submatj->destroy(C);CHKERRQ(ierr);
1317   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1318 
1319   ierr = PetscFree(submatj);CHKERRQ(ierr);
1320   PetscFunctionReturn(0);
1321 }
1322 
1323 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1324 {
1325   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1326   Mat            submat,A = c->A,B = c->B;
1327   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1328   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1329   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1330   const PetscInt *icol,*irow;
1331   PetscInt       nrow,ncol,start;
1332   PetscErrorCode ierr;
1333   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1334   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1335   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1336   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1337   PetscInt       *lens,rmax,ncols,*cols,Crow;
1338 #if defined(PETSC_USE_CTABLE)
1339   PetscTable     cmap,rmap;
1340   PetscInt       *cmap_loc,*rmap_loc;
1341 #else
1342   PetscInt       *cmap,*rmap;
1343 #endif
1344   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1345   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1346   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1347   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1348   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1349   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1350   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1351   MPI_Comm       comm;
1352   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1353   PetscMPIInt    *onodes1,*olengths1,idex,end;
1354   Mat_SubMat     *smatis1;
1355   PetscBool      isrowsorted;
1356 
1357   PetscFunctionBegin;
1358   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1359 
1360   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1361   size = c->size;
1362   rank = c->rank;
1363 
1364   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1365   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1366 
1367   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1368   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1369   if (allcolumns) {
1370     icol = NULL;
1371     ncol = C->cmap->N;
1372   } else {
1373     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1374     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1375   }
1376 
1377   if (scall == MAT_INITIAL_MATRIX) {
1378     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1379 
1380     /* Get some new tags to keep the communication clean */
1381     tag1 = ((PetscObject)C)->tag;
1382     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1383     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1384 
1385     /* evaluate communication - mesg to who, length of mesg, and buffer space
1386      required. Based on this, buffers are allocated, and data copied into them */
1387     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1388     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1389 
1390     /* w1[proc] = num of rows owned by proc -- to be requested */
1391     proc = 0;
1392     nrqs = 0; /* num of outgoing messages */
1393     for (j=0; j<nrow; j++) {
1394       row  = irow[j]; /* sorted! */
1395       while (row >= C->rmap->range[proc+1]) proc++;
1396       w1[proc]++;
1397       row2proc[j] = proc; /* map row index to proc */
1398 
1399       if (proc != rank && !w2[proc]) {
1400         w2[proc] = 1; nrqs++;
1401       }
1402     }
1403     w1[rank] = 0;  /* rows owned by self will not be requested */
1404 
1405     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1406     for (proc=0,j=0; proc<size; proc++) {
1407       if (w1[proc]) { pa[j++] = proc;}
1408     }
1409 
1410     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1411     msz = 0;              /* total mesg length (for all procs) */
1412     for (i=0; i<nrqs; i++) {
1413       proc      = pa[i];
1414       w1[proc] += 3;
1415       msz      += w1[proc];
1416     }
1417     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1418 
1419     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1420     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1421     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1422 
1423     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1424        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1425     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1426 
1427     /* Now post the Irecvs corresponding to these messages */
1428     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1429 
1430     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1431     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1432 
1433     /* Allocate Memory for outgoing messages */
1434     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1435     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1436     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1437 
1438     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1439     iptr = tmp;
1440     for (i=0; i<nrqs; i++) {
1441       proc        = pa[i];
1442       sbuf1[proc] = iptr;
1443       iptr       += w1[proc];
1444     }
1445 
1446     /* Form the outgoing messages */
1447     /* Initialize the header space */
1448     for (i=0; i<nrqs; i++) {
1449       proc      = pa[i];
1450       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1451       ptr[proc] = sbuf1[proc] + 3;
1452     }
1453 
1454     /* Parse the isrow and copy data into outbuf */
1455     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1456     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1457       proc = row2proc[j];
1458       if (proc != rank) { /* copy to the outgoing buf*/
1459         *ptr[proc] = irow[j];
1460         ctr[proc]++; ptr[proc]++;
1461       }
1462     }
1463 
1464     /* Update the headers for the current IS */
1465     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1466       if ((ctr_j = ctr[j])) {
1467         sbuf1_j        = sbuf1[j];
1468         k              = ++sbuf1_j[0];
1469         sbuf1_j[2*k]   = ctr_j;
1470         sbuf1_j[2*k-1] = 0;
1471       }
1472     }
1473 
1474     /* Now post the sends */
1475     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1476     for (i=0; i<nrqs; ++i) {
1477       proc = pa[i];
1478       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1479     }
1480 
1481     /* Post Receives to capture the buffer size */
1482     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1483     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1484 
1485     rbuf2[0] = tmp + msz;
1486     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1487 
1488     for (i=0; i<nrqs; ++i) {
1489       proc = pa[i];
1490       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1491     }
1492 
1493     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1494 
1495     /* Send to other procs the buf size they should allocate */
1496     /* Receive messages*/
1497     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1498     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1499 
1500     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1501     for (i=0; i<nrqr; ++i) {
1502       req_size[i] = 0;
1503       rbuf1_i        = rbuf1[i];
1504       start          = 2*rbuf1_i[0] + 1;
1505       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1506       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1507       sbuf2_i        = sbuf2[i];
1508       for (j=start; j<end; j++) {
1509         k            = rbuf1_i[j] - rstart;
1510         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1511         sbuf2_i[j]   = ncols;
1512         req_size[i] += ncols;
1513       }
1514       req_source1[i] = r_status1[i].MPI_SOURCE;
1515 
1516       /* form the header */
1517       sbuf2_i[0] = req_size[i];
1518       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1519 
1520       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1521     }
1522 
1523     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1524     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1525 
1526     /* rbuf2 is received, Post recv column indices a->j */
1527     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1528 
1529     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1530     for (i=0; i<nrqs; ++i) {
1531       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1532       req_source2[i] = r_status2[i].MPI_SOURCE;
1533       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1534     }
1535 
1536     /* Wait on sends1 and sends2 */
1537     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1538     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1539     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1540     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1541 
1542     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1543     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1544 
1545     /* Now allocate sending buffers for a->j, and send them off */
1546     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1547     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1548     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1549     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1550 
1551     for (i=0; i<nrqr; i++) { /* for each requested message */
1552       rbuf1_i   = rbuf1[i];
1553       sbuf_aj_i = sbuf_aj[i];
1554       ct1       = 2*rbuf1_i[0] + 1;
1555       ct2       = 0;
1556       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1557 
1558       kmax = rbuf1[i][2];
1559       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1560         row    = rbuf1_i[ct1] - rstart;
1561         nzA    = ai[row+1] - ai[row];
1562         nzB    = bi[row+1] - bi[row];
1563         ncols  = nzA + nzB;
1564         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1565 
1566         /* load the column indices for this row into cols*/
1567         cols = sbuf_aj_i + ct2;
1568 
1569         lwrite = 0;
1570         for (l=0; l<nzB; l++) {
1571           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1572         }
1573         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1574         for (l=0; l<nzB; l++) {
1575           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1576         }
1577 
1578         ct2 += ncols;
1579       }
1580       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1581     }
1582 
1583     /* create column map (cmap): global col of C -> local col of submat */
1584 #if defined(PETSC_USE_CTABLE)
1585     if (!allcolumns) {
1586       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1587       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1588       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1589         if (icol[j] >= cstart && icol[j] <cend) {
1590           cmap_loc[icol[j] - cstart] = j+1;
1591         } else { /* use PetscTable for non-local col indices */
1592           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1593         }
1594       }
1595     } else {
1596       cmap     = NULL;
1597       cmap_loc = NULL;
1598     }
1599     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1600 #else
1601     if (!allcolumns) {
1602       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1603       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1604     } else {
1605       cmap = NULL;
1606     }
1607 #endif
1608 
1609     /* Create lens for MatSeqAIJSetPreallocation() */
1610     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1611 
1612     /* Compute lens from local part of C */
1613     for (j=0; j<nrow; j++) {
1614       row  = irow[j];
1615       proc = row2proc[j];
1616       if (proc == rank) {
1617         /* diagonal part A = c->A */
1618         ncols = ai[row-rstart+1] - ai[row-rstart];
1619         cols  = aj + ai[row-rstart];
1620         if (!allcolumns) {
1621           for (k=0; k<ncols; k++) {
1622 #if defined(PETSC_USE_CTABLE)
1623             tcol = cmap_loc[cols[k]];
1624 #else
1625             tcol = cmap[cols[k]+cstart];
1626 #endif
1627             if (tcol) lens[j]++;
1628           }
1629         } else { /* allcolumns */
1630           lens[j] = ncols;
1631         }
1632 
1633         /* off-diagonal part B = c->B */
1634         ncols = bi[row-rstart+1] - bi[row-rstart];
1635         cols  = bj + bi[row-rstart];
1636         if (!allcolumns) {
1637           for (k=0; k<ncols; k++) {
1638 #if defined(PETSC_USE_CTABLE)
1639             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1640 #else
1641             tcol = cmap[bmap[cols[k]]];
1642 #endif
1643             if (tcol) lens[j]++;
1644           }
1645         } else { /* allcolumns */
1646           lens[j] += ncols;
1647         }
1648       }
1649     }
1650 
1651     /* Create row map (rmap): global row of C -> local row of submat */
1652 #if defined(PETSC_USE_CTABLE)
1653     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1654     for (j=0; j<nrow; j++) {
1655       row  = irow[j];
1656       proc = row2proc[j];
1657       if (proc == rank) { /* a local row */
1658         rmap_loc[row - rstart] = j;
1659       } else {
1660         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1661       }
1662     }
1663 #else
1664     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1665     for (j=0; j<nrow; j++) {
1666       rmap[irow[j]] = j;
1667     }
1668 #endif
1669 
1670     /* Update lens from offproc data */
1671     /* recv a->j is done */
1672     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1673     for (i=0; i<nrqs; i++) {
1674       proc    = pa[i];
1675       sbuf1_i = sbuf1[proc];
1676       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1677       ct1     = 2 + 1;
1678       ct2     = 0;
1679       rbuf2_i = rbuf2[i]; /* received length of C->j */
1680       rbuf3_i = rbuf3[i]; /* received C->j */
1681 
1682       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1683       max1   = sbuf1_i[2];
1684       for (k=0; k<max1; k++,ct1++) {
1685 #if defined(PETSC_USE_CTABLE)
1686         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1687         row--;
1688         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1689 #else
1690         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1691 #endif
1692         /* Now, store row index of submat in sbuf1_i[ct1] */
1693         sbuf1_i[ct1] = row;
1694 
1695         nnz = rbuf2_i[ct1];
1696         if (!allcolumns) {
1697           for (l=0; l<nnz; l++,ct2++) {
1698 #if defined(PETSC_USE_CTABLE)
1699             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1700               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1701             } else {
1702               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1703             }
1704 #else
1705             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1706 #endif
1707             if (tcol) lens[row]++;
1708           }
1709         } else { /* allcolumns */
1710           lens[row] += nnz;
1711         }
1712       }
1713     }
1714     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1715     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1716 
1717     /* Create the submatrices */
1718     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1719     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1720 
1721     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1722     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1723     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1724     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1725     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1726 
1727     /* create struct Mat_SubMat and attached it to submat */
1728     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1729     subc = (Mat_SeqAIJ*)submat->data;
1730     subc->submatis1 = smatis1;
1731 
1732     smatis1->id          = 0;
1733     smatis1->nrqs        = nrqs;
1734     smatis1->nrqr        = nrqr;
1735     smatis1->rbuf1       = rbuf1;
1736     smatis1->rbuf2       = rbuf2;
1737     smatis1->rbuf3       = rbuf3;
1738     smatis1->sbuf2       = sbuf2;
1739     smatis1->req_source2 = req_source2;
1740 
1741     smatis1->sbuf1       = sbuf1;
1742     smatis1->ptr         = ptr;
1743     smatis1->tmp         = tmp;
1744     smatis1->ctr         = ctr;
1745 
1746     smatis1->pa           = pa;
1747     smatis1->req_size     = req_size;
1748     smatis1->req_source1  = req_source1;
1749 
1750     smatis1->allcolumns  = allcolumns;
1751     smatis1->singleis    = PETSC_TRUE;
1752     smatis1->row2proc    = row2proc;
1753     smatis1->rmap        = rmap;
1754     smatis1->cmap        = cmap;
1755 #if defined(PETSC_USE_CTABLE)
1756     smatis1->rmap_loc    = rmap_loc;
1757     smatis1->cmap_loc    = cmap_loc;
1758 #endif
1759 
1760     smatis1->destroy     = submat->ops->destroy;
1761     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1762     submat->factortype   = C->factortype;
1763 
1764     /* compute rmax */
1765     rmax = 0;
1766     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1767 
1768   } else { /* scall == MAT_REUSE_MATRIX */
1769     submat = submats[0];
1770     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1771 
1772     subc    = (Mat_SeqAIJ*)submat->data;
1773     rmax    = subc->rmax;
1774     smatis1 = subc->submatis1;
1775     nrqs        = smatis1->nrqs;
1776     nrqr        = smatis1->nrqr;
1777     rbuf1       = smatis1->rbuf1;
1778     rbuf2       = smatis1->rbuf2;
1779     rbuf3       = smatis1->rbuf3;
1780     req_source2 = smatis1->req_source2;
1781 
1782     sbuf1     = smatis1->sbuf1;
1783     sbuf2     = smatis1->sbuf2;
1784     ptr       = smatis1->ptr;
1785     tmp       = smatis1->tmp;
1786     ctr       = smatis1->ctr;
1787 
1788     pa         = smatis1->pa;
1789     req_size   = smatis1->req_size;
1790     req_source1 = smatis1->req_source1;
1791 
1792     allcolumns = smatis1->allcolumns;
1793     row2proc   = smatis1->row2proc;
1794     rmap       = smatis1->rmap;
1795     cmap       = smatis1->cmap;
1796 #if defined(PETSC_USE_CTABLE)
1797     rmap_loc   = smatis1->rmap_loc;
1798     cmap_loc   = smatis1->cmap_loc;
1799 #endif
1800   }
1801 
1802   /* Post recv matrix values */
1803   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1804   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1805   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1806   for (i=0; i<nrqs; ++i) {
1807     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1808     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1809   }
1810 
1811   /* Allocate sending buffers for a->a, and send them off */
1812   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1813   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1814   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1815   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1816 
1817   for (i=0; i<nrqr; i++) {
1818     rbuf1_i   = rbuf1[i];
1819     sbuf_aa_i = sbuf_aa[i];
1820     ct1       = 2*rbuf1_i[0]+1;
1821     ct2       = 0;
1822     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1823 
1824     kmax = rbuf1_i[2];
1825     for (k=0; k<kmax; k++,ct1++) {
1826       row = rbuf1_i[ct1] - rstart;
1827       nzA = ai[row+1] - ai[row];
1828       nzB = bi[row+1] - bi[row];
1829       ncols  = nzA + nzB;
1830       cworkB = bj + bi[row];
1831       vworkA = a_a + ai[row];
1832       vworkB = b_a + bi[row];
1833 
1834       /* load the column values for this row into vals*/
1835       vals = sbuf_aa_i + ct2;
1836 
1837       lwrite = 0;
1838       for (l=0; l<nzB; l++) {
1839         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1840       }
1841       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1842       for (l=0; l<nzB; l++) {
1843         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1844       }
1845 
1846       ct2 += ncols;
1847     }
1848     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1849   }
1850 
1851   /* Assemble submat */
1852   /* First assemble the local rows */
1853   for (j=0; j<nrow; j++) {
1854     row  = irow[j];
1855     proc = row2proc[j];
1856     if (proc == rank) {
1857       Crow = row - rstart;  /* local row index of C */
1858 #if defined(PETSC_USE_CTABLE)
1859       row = rmap_loc[Crow]; /* row index of submat */
1860 #else
1861       row = rmap[row];
1862 #endif
1863 
1864       if (allcolumns) {
1865         /* diagonal part A = c->A */
1866         ncols = ai[Crow+1] - ai[Crow];
1867         cols  = aj + ai[Crow];
1868         vals  = a->a + ai[Crow];
1869         i     = 0;
1870         for (k=0; k<ncols; k++) {
1871           subcols[i]   = cols[k] + cstart;
1872           subvals[i++] = vals[k];
1873         }
1874 
1875         /* off-diagonal part B = c->B */
1876         ncols = bi[Crow+1] - bi[Crow];
1877         cols  = bj + bi[Crow];
1878         vals  = b->a + bi[Crow];
1879         for (k=0; k<ncols; k++) {
1880           subcols[i]   = bmap[cols[k]];
1881           subvals[i++] = vals[k];
1882         }
1883 
1884         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1885 
1886       } else { /* !allcolumns */
1887 #if defined(PETSC_USE_CTABLE)
1888         /* diagonal part A = c->A */
1889         ncols = ai[Crow+1] - ai[Crow];
1890         cols  = aj + ai[Crow];
1891         vals  = a->a + ai[Crow];
1892         i     = 0;
1893         for (k=0; k<ncols; k++) {
1894           tcol = cmap_loc[cols[k]];
1895           if (tcol) {
1896             subcols[i]   = --tcol;
1897             subvals[i++] = vals[k];
1898           }
1899         }
1900 
1901         /* off-diagonal part B = c->B */
1902         ncols = bi[Crow+1] - bi[Crow];
1903         cols  = bj + bi[Crow];
1904         vals  = b->a + bi[Crow];
1905         for (k=0; k<ncols; k++) {
1906           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1907           if (tcol) {
1908             subcols[i]   = --tcol;
1909             subvals[i++] = vals[k];
1910           }
1911         }
1912 #else
1913         /* diagonal part A = c->A */
1914         ncols = ai[Crow+1] - ai[Crow];
1915         cols  = aj + ai[Crow];
1916         vals  = a->a + ai[Crow];
1917         i     = 0;
1918         for (k=0; k<ncols; k++) {
1919           tcol = cmap[cols[k]+cstart];
1920           if (tcol) {
1921             subcols[i]   = --tcol;
1922             subvals[i++] = vals[k];
1923           }
1924         }
1925 
1926         /* off-diagonal part B = c->B */
1927         ncols = bi[Crow+1] - bi[Crow];
1928         cols  = bj + bi[Crow];
1929         vals  = b->a + bi[Crow];
1930         for (k=0; k<ncols; k++) {
1931           tcol = cmap[bmap[cols[k]]];
1932           if (tcol) {
1933             subcols[i]   = --tcol;
1934             subvals[i++] = vals[k];
1935           }
1936         }
1937 #endif
1938         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1939       }
1940     }
1941   }
1942 
1943   /* Now assemble the off-proc rows */
1944   for (i=0; i<nrqs; i++) { /* for each requested message */
1945     /* recv values from other processes */
1946     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1947     proc    = pa[idex];
1948     sbuf1_i = sbuf1[proc];
1949     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1950     ct1     = 2 + 1;
1951     ct2     = 0; /* count of received C->j */
1952     ct3     = 0; /* count of received C->j that will be inserted into submat */
1953     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1954     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1955     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1956 
1957     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1958     max1 = sbuf1_i[2];             /* num of rows */
1959     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1960       row = sbuf1_i[ct1]; /* row index of submat */
1961       if (!allcolumns) {
1962         idex = 0;
1963         if (scall == MAT_INITIAL_MATRIX) {
1964           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1965           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1966 #if defined(PETSC_USE_CTABLE)
1967             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1968               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1969             } else {
1970               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1971             }
1972 #else
1973             tcol = cmap[rbuf3_i[ct2]];
1974 #endif
1975             if (tcol) {
1976               subcols[idex]   = --tcol;
1977               subvals[idex++] = rbuf4_i[ct2];
1978 
1979               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1980                For reuse, we replace received C->j with index that should be inserted to submat */
1981               rbuf3_i[ct3++] = ct2;
1982             }
1983           }
1984           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1985 
1986         } else { /* scall == MAT_REUSE_MATRIX */
1987           submat = submats[0];
1988           subc   = (Mat_SeqAIJ*)submat->data;
1989 
1990           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1991           for (l=0; l<nnz; l++) {
1992             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1993             subvals[idex++] = rbuf4_i[ct2];
1994           }
1995 
1996           bj = subc->j + subc->i[row];
1997           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1998         }
1999       } else { /* allcolumns */
2000         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
2001         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
2002         ct2 += nnz;
2003       }
2004     }
2005   }
2006 
2007   /* sending a->a are done */
2008   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
2009   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
2010 
2011   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2012   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2013   submats[0] = submat;
2014 
2015   /* Restore the indices */
2016   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
2017   if (!allcolumns) {
2018     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
2019   }
2020 
2021   /* Destroy allocated memory */
2022   for (i=0; i<nrqs; ++i) {
2023     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
2024   }
2025   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
2026   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2027   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2028 
2029   if (scall == MAT_INITIAL_MATRIX) {
2030     ierr = PetscFree(lens);CHKERRQ(ierr);
2031     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2032     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2033   }
2034   PetscFunctionReturn(0);
2035 }
2036 
2037 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2038 {
2039   PetscErrorCode ierr;
2040   PetscInt       ncol;
2041   PetscBool      colflag,allcolumns=PETSC_FALSE;
2042 
2043   PetscFunctionBegin;
2044   /* Allocate memory to hold all the submatrices */
2045   if (scall == MAT_INITIAL_MATRIX) {
2046     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
2047   }
2048 
2049   /* Check for special case: each processor gets entire matrix columns */
2050   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2051   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2052   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2053 
2054   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2055   PetscFunctionReturn(0);
2056 }
2057 
/*
  Extracts ismax sequential submatrices of the parallel matrix C, one per
  (isrow[i], iscol[i]) pair, working in stages so that the communication
  buffers stay bounded. Dispatches to special-case implementations when
  every process has a single IS, or when the request is the entire matrix.

  Ownership: with MAT_INITIAL_MATRIX the submat array is allocated here
  (or in the callees) and trailing "dummy" matrices carry the stage count
  for later reuse; with MAT_REUSE_MATRIX the array and matrices from the
  initial call are reused in place.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubMat     *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extracted per stage, sized so that one stage
     uses roughly 20M PetscInts of workspace (at least 1 submatrix per stage) */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* wantallmatrix: the single IS pair is the identity over a square matrix,
       i.e. the caller is asking for all of C (can be disabled via options) */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One Allreduce combines both reductions:
       in[0] encodes wantallmatrix as 0/-1 so that MPI_MAX yields 0 if ANY
       rank does not want the full matrix (i.e. a logical AND after negation);
       in[1] with MPI_MAX gives the global maximum stage count. */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubMat*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* initial call went through the single-IS path; reuse must do the same */
      ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* stage count saved by the initial call */
    }
  }

  if (wantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  /* extra nstages zeroed slots hold per-stage dummy matrices created when a
     rank runs out of local ISs before the global stage loop finishes */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

#if 0
  PetscMPIInt rank;
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr);
  printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax);
#endif
  /* Every rank executes exactly nstages (global) iterations so the collective
     calls inside MatGetSubMatrices_MPIAIJ_Local() match up; a rank with no
     remaining ISs participates with max_no == 0 */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubMat*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2160 
2161 /* -------------------------------------------------------------------------*/
2162 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2163 {
2164   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2165   Mat            A  = c->A;
2166   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2167   const PetscInt **icol,**irow;
2168   PetscInt       *nrow,*ncol,start;
2169   PetscErrorCode ierr;
2170   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2171   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2172   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2173   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2174   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2175 #if defined(PETSC_USE_CTABLE)
2176   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2177 #else
2178   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2179 #endif
2180   const PetscInt *irow_i;
2181   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2182   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2183   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2184   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2185   MPI_Status     *r_status3,*r_status4,*s_status4;
2186   MPI_Comm       comm;
2187   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2188   PetscMPIInt    *onodes1,*olengths1,end;
2189   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2190   Mat_SubMat     *smat_i;
2191   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2192   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2193 
2194   PetscFunctionBegin;
2195   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2196   size = c->size;
2197   rank = c->rank;
2198 
2199   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2200   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2201 
2202   for (i=0; i<ismax; i++) {
2203     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2204     if (!issorted[i]) iscsorted = issorted[i];
2205 
2206     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2207 
2208     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2209     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2210 
2211     /* Check for special case: allcolumn */
2212     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2213     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2214     if (colflag && ncol[i] == C->cmap->N) {
2215       allcolumns[i] = PETSC_TRUE;
2216       icol[i] = NULL;
2217     } else {
2218       allcolumns[i] = PETSC_FALSE;
2219       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2220     }
2221   }
2222 
2223   if (scall == MAT_REUSE_MATRIX) {
2224     /* Assumes new rows are same length as the old rows */
2225     for (i=0; i<ismax; i++) {
2226       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2227       subc = (Mat_SeqAIJ*)submats[i]->data;
2228       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2229 
2230       /* Initial matrix as if empty */
2231       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2232 
2233       smat_i   = subc->submatis1;
2234 
2235       nrqs        = smat_i->nrqs;
2236       nrqr        = smat_i->nrqr;
2237       rbuf1       = smat_i->rbuf1;
2238       rbuf2       = smat_i->rbuf2;
2239       rbuf3       = smat_i->rbuf3;
2240       req_source2 = smat_i->req_source2;
2241 
2242       sbuf1     = smat_i->sbuf1;
2243       sbuf2     = smat_i->sbuf2;
2244       ptr       = smat_i->ptr;
2245       tmp       = smat_i->tmp;
2246       ctr       = smat_i->ctr;
2247 
2248       pa          = smat_i->pa;
2249       req_size    = smat_i->req_size;
2250       req_source1 = smat_i->req_source1;
2251 
2252       allcolumns[i] = smat_i->allcolumns;
2253       row2proc[i]   = smat_i->row2proc;
2254       rmap[i]       = smat_i->rmap;
2255       cmap[i]       = smat_i->cmap;
2256     }
2257 
2258     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2259       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2260       smat_i = (Mat_SubMat*)submats[0]->data;
2261 
2262       nrqs        = smat_i->nrqs;
2263       nrqr        = smat_i->nrqr;
2264       rbuf1       = smat_i->rbuf1;
2265       rbuf2       = smat_i->rbuf2;
2266       rbuf3       = smat_i->rbuf3;
2267       req_source2 = smat_i->req_source2;
2268 
2269       sbuf1       = smat_i->sbuf1;
2270       sbuf2       = smat_i->sbuf2;
2271       ptr         = smat_i->ptr;
2272       tmp         = smat_i->tmp;
2273       ctr         = smat_i->ctr;
2274 
2275       pa          = smat_i->pa;
2276       req_size    = smat_i->req_size;
2277       req_source1 = smat_i->req_source1;
2278 
2279       allcolumns[0] = PETSC_TRUE;
2280     }
2281   } else { /* scall == MAT_INITIAL_MATRIX */
2282     /* Get some new tags to keep the communication clean */
2283     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2284     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2285 
2286     /* evaluate communication - mesg to who, length of mesg, and buffer space
2287      required. Based on this, buffers are allocated, and data copied into them*/
2288     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2289 
2290     for (i=0; i<ismax; i++) {
2291       jmax   = nrow[i];
2292       irow_i = irow[i];
2293 
2294       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2295       row2proc[i] = row2proc_i;
2296 
2297       if (issorted[i]) proc = 0;
2298       for (j=0; j<jmax; j++) {
2299         if (!issorted[i]) proc = 0;
2300         row = irow_i[j];
2301         while (row >= C->rmap->range[proc+1]) proc++;
2302         w4[proc]++;
2303         row2proc_i[j] = proc; /* map row index to proc */
2304       }
2305       for (j=0; j<size; j++) {
2306         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2307       }
2308     }
2309 
2310     nrqs     = 0;              /* no of outgoing messages */
2311     msz      = 0;              /* total mesg length (for all procs) */
2312     w1[rank] = 0;              /* no mesg sent to self */
2313     w3[rank] = 0;
2314     for (i=0; i<size; i++) {
2315       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2316     }
2317     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2318     for (i=0,j=0; i<size; i++) {
2319       if (w1[i]) { pa[j] = i; j++; }
2320     }
2321 
2322     /* Each message would have a header = 1 + 2*(no of IS) + data */
2323     for (i=0; i<nrqs; i++) {
2324       j      = pa[i];
2325       w1[j] += w2[j] + 2* w3[j];
2326       msz   += w1[j];
2327     }
2328     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2329 
2330     /* Determine the number of messages to expect, their lengths, from from-ids */
2331     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2332     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2333 
2334     /* Now post the Irecvs corresponding to these messages */
2335     tag0 = ((PetscObject)C)->tag;
2336     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2337 
2338     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2339     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2340 
2341     /* Allocate Memory for outgoing messages */
2342     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2343     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2344     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2345 
2346     {
2347       PetscInt *iptr = tmp;
2348       k    = 0;
2349       for (i=0; i<nrqs; i++) {
2350         j        = pa[i];
2351         iptr    += k;
2352         sbuf1[j] = iptr;
2353         k        = w1[j];
2354       }
2355     }
2356 
2357     /* Form the outgoing messages. Initialize the header space */
2358     for (i=0; i<nrqs; i++) {
2359       j           = pa[i];
2360       sbuf1[j][0] = 0;
2361       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2362       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2363     }
2364 
2365     /* Parse the isrow and copy data into outbuf */
2366     for (i=0; i<ismax; i++) {
2367       row2proc_i = row2proc[i];
2368       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2369       irow_i = irow[i];
2370       jmax   = nrow[i];
2371       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2372         proc = row2proc_i[j];
2373         if (proc != rank) { /* copy to the outgoing buf*/
2374           ctr[proc]++;
2375           *ptr[proc] = irow_i[j];
2376           ptr[proc]++;
2377         }
2378       }
2379       /* Update the headers for the current IS */
2380       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2381         if ((ctr_j = ctr[j])) {
2382           sbuf1_j        = sbuf1[j];
2383           k              = ++sbuf1_j[0];
2384           sbuf1_j[2*k]   = ctr_j;
2385           sbuf1_j[2*k-1] = i;
2386         }
2387       }
2388     }
2389 
2390     /*  Now  post the sends */
2391     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2392     for (i=0; i<nrqs; ++i) {
2393       j    = pa[i];
2394       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2395     }
2396 
2397     /* Post Receives to capture the buffer size */
2398     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2399     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2400     rbuf2[0] = tmp + msz;
2401     for (i=1; i<nrqs; ++i) {
2402       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2403     }
2404     for (i=0; i<nrqs; ++i) {
2405       j    = pa[i];
2406       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2407     }
2408 
2409     /* Send to other procs the buf size they should allocate */
2410     /* Receive messages*/
2411     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2412     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2413     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2414     {
2415       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2416       PetscInt   *sbuf2_i;
2417 
2418       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2419       for (i=0; i<nrqr; ++i) {
2420         req_size[i] = 0;
2421         rbuf1_i        = rbuf1[i];
2422         start          = 2*rbuf1_i[0] + 1;
2423         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2424         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2425         sbuf2_i        = sbuf2[i];
2426         for (j=start; j<end; j++) {
2427           id              = rbuf1_i[j] - rstart;
2428           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2429           sbuf2_i[j]      = ncols;
2430           req_size[i] += ncols;
2431         }
2432         req_source1[i] = r_status1[i].MPI_SOURCE;
2433         /* form the header */
2434         sbuf2_i[0] = req_size[i];
2435         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2436 
2437         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2438       }
2439     }
2440     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2441     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2442     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2443 
2444     /* Receive messages*/
2445     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2446     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2447 
2448     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2449     for (i=0; i<nrqs; ++i) {
2450       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2451       req_source2[i] = r_status2[i].MPI_SOURCE;
2452       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2453     }
2454     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2455     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2456 
2457     /* Wait on sends1 and sends2 */
2458     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2459     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2460 
2461     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2462     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2463     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2464     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2465     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2466     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2467 
2468     /* Now allocate sending buffers for a->j, and send them off */
2469     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2470     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2471     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2472     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2473 
2474     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2475     {
2476       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2477       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2478       PetscInt cend = C->cmap->rend;
2479       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2480 
2481       for (i=0; i<nrqr; i++) {
2482         rbuf1_i   = rbuf1[i];
2483         sbuf_aj_i = sbuf_aj[i];
2484         ct1       = 2*rbuf1_i[0] + 1;
2485         ct2       = 0;
2486         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2487           kmax = rbuf1[i][2*j];
2488           for (k=0; k<kmax; k++,ct1++) {
2489             row    = rbuf1_i[ct1] - rstart;
2490             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2491             ncols  = nzA + nzB;
2492             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2493 
2494             /* load the column indices for this row into cols */
2495             cols = sbuf_aj_i + ct2;
2496 
2497             lwrite = 0;
2498             for (l=0; l<nzB; l++) {
2499               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2500             }
2501             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2502             for (l=0; l<nzB; l++) {
2503               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2504             }
2505 
2506             ct2 += ncols;
2507           }
2508         }
2509         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2510       }
2511     }
2512     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2513 
2514     /* create col map: global col of C -> local col of submatrices */
2515     {
2516       const PetscInt *icol_i;
2517 #if defined(PETSC_USE_CTABLE)
2518       for (i=0; i<ismax; i++) {
2519         if (!allcolumns[i]) {
2520           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2521 
2522           jmax   = ncol[i];
2523           icol_i = icol[i];
2524           cmap_i = cmap[i];
2525           for (j=0; j<jmax; j++) {
2526             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2527           }
2528         } else cmap[i] = NULL;
2529       }
2530 #else
2531       for (i=0; i<ismax; i++) {
2532         if (!allcolumns[i]) {
2533           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2534           jmax   = ncol[i];
2535           icol_i = icol[i];
2536           cmap_i = cmap[i];
2537           for (j=0; j<jmax; j++) {
2538             cmap_i[icol_i[j]] = j+1;
2539           }
2540         } else cmap[i] = NULL;
2541       }
2542 #endif
2543     }
2544 
2545     /* Create lens which is required for MatCreate... */
2546     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2547     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2548 
2549     if (ismax) {
2550       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2551     }
2552     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2553 
2554     /* Update lens from local data */
2555     for (i=0; i<ismax; i++) {
2556       row2proc_i = row2proc[i];
2557       jmax = nrow[i];
2558       if (!allcolumns[i]) cmap_i = cmap[i];
2559       irow_i = irow[i];
2560       lens_i = lens[i];
2561       for (j=0; j<jmax; j++) {
2562         row = irow_i[j];
2563         proc = row2proc_i[j];
2564         if (proc == rank) {
2565           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2566           if (!allcolumns[i]) {
2567             for (k=0; k<ncols; k++) {
2568 #if defined(PETSC_USE_CTABLE)
2569               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2570 #else
2571               tcol = cmap_i[cols[k]];
2572 #endif
2573               if (tcol) lens_i[j]++;
2574             }
2575           } else { /* allcolumns */
2576             lens_i[j] = ncols;
2577           }
2578           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2579         }
2580       }
2581     }
2582 
2583     /* Create row map: global row of C -> local row of submatrices */
2584 #if defined(PETSC_USE_CTABLE)
2585     for (i=0; i<ismax; i++) {
2586       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2587       irow_i = irow[i];
2588       jmax   = nrow[i];
2589       for (j=0; j<jmax; j++) {
2590       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2591       }
2592     }
2593 #else
2594     for (i=0; i<ismax; i++) {
2595       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2596       rmap_i = rmap[i];
2597       irow_i = irow[i];
2598       jmax   = nrow[i];
2599       for (j=0; j<jmax; j++) {
2600         rmap_i[irow_i[j]] = j;
2601       }
2602     }
2603 #endif
2604 
2605     /* Update lens from offproc data */
2606     {
2607       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2608 
2609       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2610       for (tmp2=0; tmp2<nrqs; tmp2++) {
2611         sbuf1_i = sbuf1[pa[tmp2]];
2612         jmax    = sbuf1_i[0];
2613         ct1     = 2*jmax+1;
2614         ct2     = 0;
2615         rbuf2_i = rbuf2[tmp2];
2616         rbuf3_i = rbuf3[tmp2];
2617         for (j=1; j<=jmax; j++) {
2618           is_no  = sbuf1_i[2*j-1];
2619           max1   = sbuf1_i[2*j];
2620           lens_i = lens[is_no];
2621           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2622           rmap_i = rmap[is_no];
2623           for (k=0; k<max1; k++,ct1++) {
2624 #if defined(PETSC_USE_CTABLE)
2625             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2626             row--;
2627             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2628 #else
2629             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2630 #endif
2631             max2 = rbuf2_i[ct1];
2632             for (l=0; l<max2; l++,ct2++) {
2633               if (!allcolumns[is_no]) {
2634 #if defined(PETSC_USE_CTABLE)
2635                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2636 #else
2637                 tcol = cmap_i[rbuf3_i[ct2]];
2638 #endif
2639                 if (tcol) lens_i[row]++;
2640               } else { /* allcolumns */
2641                 lens_i[row]++; /* lens_i[row] += max2 ? */
2642               }
2643             }
2644           }
2645         }
2646       }
2647     }
2648     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2649     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2650     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2651     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2652 
2653     /* Create the submatrices */
2654     for (i=0; i<ismax; i++) {
2655       PetscInt    rbs,cbs;
2656 
2657       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2658       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2659 
2660       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2661       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2662 
2663       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2664       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2665       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2666 
2667       /* create struct Mat_SubMat and attached it to submat */
2668       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2669       subc = (Mat_SeqAIJ*)submats[i]->data;
2670       subc->submatis1 = smat_i;
2671 
2672       smat_i->destroy          = submats[i]->ops->destroy;
2673       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2674       submats[i]->factortype   = C->factortype;
2675 
2676       smat_i->id          = i;
2677       smat_i->nrqs        = nrqs;
2678       smat_i->nrqr        = nrqr;
2679       smat_i->rbuf1       = rbuf1;
2680       smat_i->rbuf2       = rbuf2;
2681       smat_i->rbuf3       = rbuf3;
2682       smat_i->sbuf2       = sbuf2;
2683       smat_i->req_source2 = req_source2;
2684 
2685       smat_i->sbuf1       = sbuf1;
2686       smat_i->ptr         = ptr;
2687       smat_i->tmp         = tmp;
2688       smat_i->ctr         = ctr;
2689 
2690       smat_i->pa           = pa;
2691       smat_i->req_size     = req_size;
2692       smat_i->req_source1  = req_source1;
2693 
2694       smat_i->allcolumns  = allcolumns[i];
2695       smat_i->singleis    = PETSC_FALSE;
2696       smat_i->row2proc    = row2proc[i];
2697       smat_i->rmap        = rmap[i];
2698       smat_i->cmap        = cmap[i];
2699     }
2700 
2701     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2702       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2703       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2704       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2705 
2706       /* create struct Mat_SubMat and attached it to submat */
2707       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2708       submats[0]->data = (void*)smat_i;
2709 
2710       smat_i->destroy          = submats[0]->ops->destroy;
2711       submats[0]->ops->destroy = MatDestroy_Dummy_MatGetSubmatrices;
2712       submats[0]->factortype   = C->factortype;
2713 
2714       smat_i->id          = i;
2715       smat_i->nrqs        = nrqs;
2716       smat_i->nrqr        = nrqr;
2717       smat_i->rbuf1       = rbuf1;
2718       smat_i->rbuf2       = rbuf2;
2719       smat_i->rbuf3       = rbuf3;
2720       smat_i->sbuf2       = sbuf2;
2721       smat_i->req_source2 = req_source2;
2722 
2723       smat_i->sbuf1       = sbuf1;
2724       smat_i->ptr         = ptr;
2725       smat_i->tmp         = tmp;
2726       smat_i->ctr         = ctr;
2727 
2728       smat_i->pa           = pa;
2729       smat_i->req_size     = req_size;
2730       smat_i->req_source1  = req_source1;
2731 
2732       smat_i->allcolumns  = PETSC_TRUE;
2733       smat_i->singleis    = PETSC_FALSE;
2734       smat_i->row2proc    = NULL;
2735       smat_i->rmap        = NULL;
2736       smat_i->cmap        = NULL;
2737     }
2738 
2739     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2740     ierr = PetscFree(lens);CHKERRQ(ierr);
2741     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2742     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2743 
2744   } /* endof scall == MAT_INITIAL_MATRIX */
2745 
2746   /* Post recv matrix values */
2747   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2748   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2749   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2750   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2751   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2752   for (i=0; i<nrqs; ++i) {
2753     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2754     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2755   }
2756 
2757   /* Allocate sending buffers for a->a, and send them off */
2758   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2759   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2760   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2761   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2762 
2763   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2764   {
2765     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2766     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2767     PetscInt    cend   = C->cmap->rend;
2768     PetscInt    *b_j   = b->j;
2769     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2770 
2771     for (i=0; i<nrqr; i++) {
2772       rbuf1_i   = rbuf1[i];
2773       sbuf_aa_i = sbuf_aa[i];
2774       ct1       = 2*rbuf1_i[0]+1;
2775       ct2       = 0;
2776       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2777         kmax = rbuf1_i[2*j];
2778         for (k=0; k<kmax; k++,ct1++) {
2779           row    = rbuf1_i[ct1] - rstart;
2780           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2781           ncols  = nzA + nzB;
2782           cworkB = b_j + b_i[row];
2783           vworkA = a_a + a_i[row];
2784           vworkB = b_a + b_i[row];
2785 
2786           /* load the column values for this row into vals*/
2787           vals = sbuf_aa_i+ct2;
2788 
2789           lwrite = 0;
2790           for (l=0; l<nzB; l++) {
2791             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2792           }
2793           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2794           for (l=0; l<nzB; l++) {
2795             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2796           }
2797 
2798           ct2 += ncols;
2799         }
2800       }
2801       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2802     }
2803   }
2804 
2805   /* Assemble the matrices */
2806   /* First assemble the local rows */
2807   for (i=0; i<ismax; i++) {
2808     row2proc_i = row2proc[i];
2809     subc      = (Mat_SeqAIJ*)submats[i]->data;
2810     imat_ilen = subc->ilen;
2811     imat_j    = subc->j;
2812     imat_i    = subc->i;
2813     imat_a    = subc->a;
2814 
2815     if (!allcolumns[i]) cmap_i = cmap[i];
2816     rmap_i = rmap[i];
2817     irow_i = irow[i];
2818     jmax   = nrow[i];
2819     for (j=0; j<jmax; j++) {
2820       row  = irow_i[j];
2821       proc = row2proc_i[j];
2822       if (proc == rank) {
2823         old_row = row;
2824 #if defined(PETSC_USE_CTABLE)
2825         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2826         row--;
2827 #else
2828         row = rmap_i[row];
2829 #endif
2830         ilen_row = imat_ilen[row];
2831         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2832         mat_i    = imat_i[row];
2833         mat_a    = imat_a + mat_i;
2834         mat_j    = imat_j + mat_i;
2835         if (!allcolumns[i]) {
2836           for (k=0; k<ncols; k++) {
2837 #if defined(PETSC_USE_CTABLE)
2838             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2839 #else
2840             tcol = cmap_i[cols[k]];
2841 #endif
2842             if (tcol) {
2843               *mat_j++ = tcol - 1;
2844               *mat_a++ = vals[k];
2845               ilen_row++;
2846             }
2847           }
2848         } else { /* allcolumns */
2849           for (k=0; k<ncols; k++) {
2850             *mat_j++ = cols[k];  /* global col index! */
2851             *mat_a++ = vals[k];
2852             ilen_row++;
2853           }
2854         }
2855         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2856 
2857         imat_ilen[row] = ilen_row;
2858       }
2859     }
2860   }
2861 
2862   /* Now assemble the off proc rows */
2863   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2864   for (tmp2=0; tmp2<nrqs; tmp2++) {
2865     sbuf1_i = sbuf1[pa[tmp2]];
2866     jmax    = sbuf1_i[0];
2867     ct1     = 2*jmax + 1;
2868     ct2     = 0;
2869     rbuf2_i = rbuf2[tmp2];
2870     rbuf3_i = rbuf3[tmp2];
2871     rbuf4_i = rbuf4[tmp2];
2872     for (j=1; j<=jmax; j++) {
2873       is_no     = sbuf1_i[2*j-1];
2874       rmap_i    = rmap[is_no];
2875       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2876       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2877       imat_ilen = subc->ilen;
2878       imat_j    = subc->j;
2879       imat_i    = subc->i;
2880       imat_a    = subc->a;
2881       max1      = sbuf1_i[2*j];
2882       for (k=0; k<max1; k++,ct1++) {
2883         row = sbuf1_i[ct1];
2884 #if defined(PETSC_USE_CTABLE)
2885         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2886         row--;
2887 #else
2888         row = rmap_i[row];
2889 #endif
2890         ilen  = imat_ilen[row];
2891         mat_i = imat_i[row];
2892         mat_a = imat_a + mat_i;
2893         mat_j = imat_j + mat_i;
2894         max2  = rbuf2_i[ct1];
2895         if (!allcolumns[is_no]) {
2896           for (l=0; l<max2; l++,ct2++) {
2897 #if defined(PETSC_USE_CTABLE)
2898             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2899 #else
2900             tcol = cmap_i[rbuf3_i[ct2]];
2901 #endif
2902             if (tcol) {
2903               *mat_j++ = tcol - 1;
2904               *mat_a++ = rbuf4_i[ct2];
2905               ilen++;
2906             }
2907           }
2908         } else { /* allcolumns */
2909           for (l=0; l<max2; l++,ct2++) {
2910             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2911             *mat_a++ = rbuf4_i[ct2];
2912             ilen++;
2913           }
2914         }
2915         imat_ilen[row] = ilen;
2916       }
2917     }
2918   }
2919 
2920   if (!iscsorted) { /* sort column indices of the rows */
2921     for (i=0; i<ismax; i++) {
2922       subc      = (Mat_SeqAIJ*)submats[i]->data;
2923       imat_j    = subc->j;
2924       imat_i    = subc->i;
2925       imat_a    = subc->a;
2926       imat_ilen = subc->ilen;
2927 
2928       if (allcolumns[i]) continue;
2929       jmax = nrow[i];
2930       for (j=0; j<jmax; j++) {
2931         mat_i = imat_i[j];
2932         mat_a = imat_a + mat_i;
2933         mat_j = imat_j + mat_i;
2934         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2935       }
2936     }
2937   }
2938 
2939   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2940   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2941   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2942   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2943   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2944 
2945   /* Restore the indices */
2946   for (i=0; i<ismax; i++) {
2947     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2948     if (!allcolumns[i]) {
2949       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2950     }
2951   }
2952 
2953   for (i=0; i<ismax; i++) {
2954     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2955     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2956   }
2957 
2958   /* Destroy allocated memory */
2959   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2960   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2961   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2962 
2963   for (i=0; i<nrqs; ++i) {
2964     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2965   }
2966   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2967 
2968   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2969   PetscFunctionReturn(0);
2970 }
2971 
2972 /*
2973  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2974  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2975  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2976  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2977  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2978  state, and needs to be "assembled" later by compressing B's column space.
2979 
2980  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2981  Following this call, C->A & C->B have been created, even if empty.
2982  */
2983 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2984 {
2985   /* If making this function public, change the error returned in this function away from _PLIB. */
2986   PetscErrorCode ierr;
2987   Mat_MPIAIJ     *aij;
2988   Mat_SeqAIJ     *Baij;
2989   PetscBool      seqaij,Bdisassembled;
2990   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2991   PetscScalar    v;
2992   const PetscInt *rowindices,*colindices;
2993 
2994   PetscFunctionBegin;
2995   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2996   if (A) {
2997     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2998     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2999     if (rowemb) {
3000       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
3001       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
3002     } else {
3003       if (C->rmap->n != A->rmap->n) {
3004 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
3005       }
3006     }
3007     if (dcolemb) {
3008       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
3009       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
3010     } else {
3011       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
3012     }
3013   }
3014   if (B) {
3015     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
3016     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
3017     if (rowemb) {
3018       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
3019       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
3020     } else {
3021       if (C->rmap->n != B->rmap->n) {
3022 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
3023       }
3024     }
3025     if (ocolemb) {
3026       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
3027       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
3028     } else {
3029       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
3030     }
3031   }
3032 
3033   aij = (Mat_MPIAIJ*)C->data;
3034   if (!aij->A) {
3035     /* Mimic parts of MatMPIAIJSetPreallocation() */
3036     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
3037     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
3038     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
3039     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
3040     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
3041   }
3042   if (A) {
3043     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
3044   } else {
3045     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
3046   }
3047   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
3048     /*
3049       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
3050       need to "disassemble" B -- convert it to using C's global indices.
3051       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
3052 
3053       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
3054       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
3055 
3056       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
3057       At least avoid calling MatSetValues() and the implied searches?
3058     */
3059 
3060     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
3061 #if defined(PETSC_USE_CTABLE)
3062       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
3063 #else
3064       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
3065       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
3066       if (aij->B) {
3067         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
3068       }
3069 #endif
3070       ngcol = 0;
3071       if (aij->lvec) {
3072 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
3073       }
3074       if (aij->garray) {
3075 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
3076 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
3077       }
3078       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3079       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3080     }
3081     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3082       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3083     }
3084     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3085       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3086     }
3087   }
3088   Bdisassembled = PETSC_FALSE;
3089   if (!aij->B) {
3090     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3091     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3092     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3093     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3094     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3095     Bdisassembled = PETSC_TRUE;
3096   }
3097   if (B) {
3098     Baij = (Mat_SeqAIJ*)B->data;
3099     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3100       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3101       for (i=0; i<B->rmap->n; i++) {
3102 	nz[i] = Baij->i[i+1] - Baij->i[i];
3103       }
3104       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3105       ierr = PetscFree(nz);CHKERRQ(ierr);
3106     }
3107 
3108     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3109     shift = rend-rstart;
3110     count = 0;
3111     rowindices = NULL;
3112     colindices = NULL;
3113     if (rowemb) {
3114       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3115     }
3116     if (ocolemb) {
3117       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3118     }
3119     for (i=0; i<B->rmap->n; i++) {
3120       PetscInt row;
3121       row = i;
3122       if (rowindices) row = rowindices[i];
3123       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3124 	col  = Baij->j[count];
3125 	if (colindices) col = colindices[col];
3126 	if (Bdisassembled && col>=rstart) col += shift;
3127 	v    = Baij->a[count];
3128 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3129 	++count;
3130       }
3131     }
3132     /* No assembly for aij->B is necessary. */
3133     /* FIXME: set aij->B's nonzerostate correctly. */
3134   } else {
3135     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3136   }
3137   C->preallocated  = PETSC_TRUE;
3138   C->was_assembled = PETSC_FALSE;
3139   C->assembled     = PETSC_FALSE;
3140    /*
3141       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3142       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3143    */
3144   PetscFunctionReturn(0);
3145 }
3146 
3147 /*
3148   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3149  */
3150 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3151 {
3152   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3153 
3154   PetscFunctionBegin;
3155   PetscValidPointer(A,2);
3156   PetscValidPointer(B,3);
3157   /* FIXME: make sure C is assembled */
3158   *A = aij->A;
3159   *B = aij->B;
3160   /* Note that we don't incref *A and *B, so be careful! */
3161   PetscFunctionReturn(0);
3162 }
3163 
3164 /*
3165   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3166   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3167 */
3168 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3169                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3170 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3171 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3172 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3173 {
3174   PetscErrorCode ierr;
3175   PetscMPIInt    isize,flag;
3176   PetscInt       i,ii,cismax,ispar;
3177   Mat            *A,*B;
3178   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3179 
3180   PetscFunctionBegin;
3181   if (!ismax) PetscFunctionReturn(0);
3182 
3183   for (i = 0, cismax = 0; i < ismax; ++i) {
3184     PetscMPIInt isize;
3185     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3186     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3187     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3188     if (isize > 1) ++cismax;
3189   }
3190 
3191   /*
3192      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3193      ispar counts the number of parallel ISs across C's comm.
3194   */
3195   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3196   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3197     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3198     PetscFunctionReturn(0);
3199   }
3200 
3201   /* if (ispar) */
3202   /*
3203     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3204     These are used to extract the off-diag portion of the resulting parallel matrix.
3205     The row IS for the off-diag portion is the same as for the diag portion,
3206     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3207   */
3208   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3209   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3210   for (i = 0, ii = 0; i < ismax; ++i) {
3211     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3212     if (isize > 1) {
3213       /*
3214 	 TODO: This is the part that's ***NOT SCALABLE***.
3215 	 To fix this we need to extract just the indices of C's nonzero columns
3216 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3217 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3218 	 to be done without serializing on the IS list, so, most likely, it is best
3219 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3220       */
3221       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3222       /* Now we have to
3223 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3224 	     were sorted on each rank, concatenated they might no longer be sorted;
3225 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3226 	     indices in the nondecreasing order to the original index positions.
3227 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3228       */
3229       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3230       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3231       ++ii;
3232     }
3233   }
3234   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3235   for (i = 0, ii = 0; i < ismax; ++i) {
3236     PetscInt       j,issize;
3237     const PetscInt *indices;
3238 
3239     /*
3240        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3241      */
3242     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3243     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3244     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3245     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3246     for (j = 1; j < issize; ++j) {
3247       if (indices[j] == indices[j-1]) {
3248 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3249       }
3250     }
3251     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3252 
3253 
3254     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3255     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3256     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3257     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3258     for (j = 1; j < issize; ++j) {
3259       if (indices[j-1] == indices[j]) {
3260 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3261       }
3262     }
3263     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3264     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3265     if (isize > 1) {
3266       cisrow[ii] = isrow[i];
3267       ++ii;
3268     }
3269   }
3270   /*
3271     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3272     array of sequential matrices underlying the resulting parallel matrices.
3273     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3274     contain duplicates.
3275 
3276     There are as many diag matrices as there are original index sets. There are only as many parallel
3277     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3278 
3279     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3280     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3281       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3282       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3283       with A[i] and B[ii] extracted from the corresponding MPI submat.
3284     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3285       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3286       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3287       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3288       values into A[i] and B[ii] sitting inside the corresponding submat.
3289     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3290       A[i], B[ii], AA[i] or BB[ii] matrices.
3291   */
3292   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3293   if (scall == MAT_INITIAL_MATRIX) {
3294     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3295   }
3296 
3297   /* Now obtain the sequential A and B submatrices separately. */
3298   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3299   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3300   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3301 
3302   /*
3303     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3304     matrices A & B have been extracted directly into the parallel matrices containing them, or
3305     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3306     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3307     to have the same sparsity pattern.
3308     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3309     must be constructed for C. This is done by setseqmat(s).
3310   */
3311   for (i = 0, ii = 0; i < ismax; ++i) {
3312     /*
3313        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3314        That way we can avoid sorting and computing permutations when reusing.
3315        To this end:
3316         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3317 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3318 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3319     */
3320     MatStructure pattern;
3321     pattern = DIFFERENT_NONZERO_PATTERN;
3322 
3323     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3324     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3325     if (isize > 1) {
3326       if (scall == MAT_INITIAL_MATRIX) {
3327 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3328 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3329 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3330 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3331 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3332       }
3333       /*
3334 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3335       */
3336       {
3337 	Mat AA,BB;
3338         AA = A[i];
3339         BB = B[ii];
3340 	if (AA || BB) {
3341 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3342 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3343 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3344 	}
3345 
3346         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3347       }
3348       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3349       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3350       ++ii;
3351     } else { /* if (isize == 1) */
3352       if (scall == MAT_REUSE_MATRIX) {
3353         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3354       }
3355       if (isrow_p[i] || iscol_p[i]) {
3356         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3357         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3358 	/* Otherwise A is extracted straight into (*submats)[i]. */
3359 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3360 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3361       } else (*submat)[i] = A[i];
3362     }
3363     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3364     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3365   }
3366   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3367   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3368   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3369   ierr = PetscFree(A);CHKERRQ(ierr);
3370   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3371   PetscFunctionReturn(0);
3372 }
3373 
3374 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3375 {
3376   PetscErrorCode ierr;
3377 
3378   PetscFunctionBegin;
3379   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3380   PetscFunctionReturn(0);
3381 }
3382