xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 1ebf93c6b7d760d592de6ebe6cdc0debaa3caf75)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #include <../src/mat/impls/aij/mpi/mpiaij.h>
8 #include <petscbt.h>
9 #include <petscsf.h>
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21 
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
26 {
27   PetscErrorCode ierr;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   for (i=0; i<ov; ++i) {
33     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
34   }
35   PetscFunctionReturn(0);
36 }
37 
38 #undef __FUNCT__
39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
41 {
42   PetscErrorCode ierr;
43   PetscInt       i;
44 
45   PetscFunctionBegin;
46   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
47   for (i=0; i<ov; ++i) {
48     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
49   }
50   PetscFunctionReturn(0);
51 }
52 
53 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Increase the overlap of the nidx index
   sets in is[] by exactly one layer, using PetscCommBuildTwoSided and PetscSF so
   that the work and memory scale with the number of neighbor processors rather
   than with the communicator size.

   Input Parameters:
+  mat  - the parallel MPIAIJ matrix
-  nidx - number of index sets

   Input/Output Parameter:
.  is   - array of index sets; on output each is replaced by its one-layer overlap

   Outline:
     1) gather all remote rows referenced by the local index sets,
     2) ship <is id, row> pairs to the owning processors,
     3) owners expand the received rows through their matrix rows (Send_Scalable),
     4) expanded column lists come back and are merged in (Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors; tlength bounds how many can be remote */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    /* purely local expansion suffices on every rank */
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row ships as an <is id,row> pair */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* one SF leaf per received PetscInt; index is the offset on the sender side */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes; note toranks/tosizes are reallocated here */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the remotely expanded rows into the (already locally expanded) index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
198 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap data received from
   remote processors back into the local index sets.

   Input Parameters:
+  mat      - the parallel matrix (carried for interface symmetry; not dereferenced here)
.  nidx     - number of index sets
.  nrecvs   - total number of PetscInt entries in recvdata
-  recvdata - packed message stream of triples <is id, count, count row indices>

   Input/Output Parameter:
.  is - index sets; each is destroyed and recreated with the received rows merged
        in, sorted, and deduplicated
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record current sizes of the index sets and find the largest */
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slab of (max_lsize+nrecvs) entries: enough for its current
     indices plus, in the worst case, every received entry landing in this IS */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap:
     walk the <is id, count, rows...> stream and append rows to the slabs */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries and rebuild the index sets */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
246 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - On the owning processor, expand the
   rows requested by other processors through the local matrix rows, and pack the
   resulting column lists into a reply buffer.

   Input Parameters:
+  mat       - the parallel MPIAIJ matrix
.  nidx      - number of index sets being overlapped
.  nfrom     - number of requesting processors
.  fromranks - ranks of the requesting processors
.  fromsizes - per-rank pairs <message length, offset> as built by PetscCommBuildTwoSided
-  fromrows  - received stream of <is id, row> pairs, grouped by sender

   Output Parameters:
+  sbrowsizes - per-rank pairs <reply length, reply offset> (allocated here; caller frees)
-  sbrows     - packed reply stream of triples <is id, count, count columns>
                (allocated here; caller frees)

   The expansion is run twice with identical traversal: the first pass only counts
   (after dedup) to size the reply buffer, the second pass fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count (deduplicated) expansion sizes per sender and per IS */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again: second pass repeats the traversal, now packing sbdata */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs for the two-sided exchange */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
370 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by one layer
   using only the locally owned rows: for every locally owned row in an IS, add
   all of its column indices (from both the diagonal and off-diagonal blocks).
   Rows owned by other processors are skipped here; their contribution arrives
   through the Send/Receive_Scalable pair.

   Input Parameters:
+  mat  - the parallel MPIAIJ matrix
-  nidx - number of index sets

   Input/Output Parameter:
.  is - index sets; each is destroyed and recreated sorted and deduplicated
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat                   amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows are handled by Send/Receive_Scalable */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: shift local columns to global numbering */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: garray maps to global column numbers */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
436 
437 
438 /*
439   Sample message format:
440   If a processor A wants processor B to process some elements corresponding
441   to index sets is[1],is[5]
442   mesg [0] = 2   (no of index sets in the mesg)
443   -----------
444   mesg [1] = 1 => is[1]
445   mesg [2] = sizeof(is[1]);
446   -----------
447   mesg [3] = 5  => is[5]
448   mesg [4] = sizeof(is[5]);
449   -----------
450   mesg [5]
451   mesg [n]  datas[1]
452   -----------
453   mesg[n+1]
454   mesg[m]  data(is[5])
455   -----------
456 
  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increase the overlap of the imax index sets in
   is[] by one layer, using the original (non-scalable) algorithm: per-rank work
   arrays of communicator size, a dense bit table of M bits per IS, and explicit
   MPI_Isend/Irecv message exchange.

   Input Parameters:
+  C    - the parallel MPIAIJ matrix
-  imax - number of index sets

   Input/Output Parameter:
.  is - index sets; each is destroyed and recreated with its one-layer overlap
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = #indices to send to p, w2[p] = 1 iff a message goes to p,
     w3[p] = #ISs that contribute to the message to p, w4 = per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: mesg[0] = #IS entries, then 2*w3[j] header slots */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: per-IS bit table (M bits) and
     index buffer (M entries), all carved from two big allocations */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received requests through the local matrix rows */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same <count, is id> header layout as phase 1 */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the index sets from the accumulated rows */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
733 
734 #undef __FUNCT__
735 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
736 /*
737    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
738        the work on the local processor.
739 
740      Inputs:
741       C      - MAT_MPIAIJ;
742       imax - total no of index sets processed at a time;
743       table  - an array of char - size = m bits.
744 
745      Output:
746       isz    - array containing the count of the solution elements corresponding
747                to each index set;
748       data   - pointer to the solutions
749 */
750 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
751 {
752   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
753   Mat        A  = c->A,B = c->B;
754   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
755   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
756   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
757   PetscBT    table_i;
758 
759   PetscFunctionBegin;
760   rstart = C->rmap->rstart;
761   cstart = C->cmap->rstart;
762   ai     = a->i;
763   aj     = a->j;
764   bi     = b->i;
765   bj     = b->j;
766   garray = c->garray;
767 
768 
769   for (i=0; i<imax; i++) {
770     data_i  = data[i];
771     table_i = table[i];
772     isz_i   = isz[i];
773     for (j=0,max=isz[i]; j<max; j++) {
774       row   = data_i[j] - rstart;
775       start = ai[row];
776       end   = ai[row+1];
777       for (k=start; k<end; k++) { /* Amat */
778         val = aj[k] + cstart;
779         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
780       }
781       start = bi[row];
782       end   = bi[row+1];
783       for (k=start; k<end; k++) { /* Bmat */
784         val = garray[bj[k]];
785         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
786       }
787     }
788     isz[i] = isz_i;
789   }
790   PetscFunctionReturn(0);
791 }
792 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  Message layout (both incoming rbuf[i] and outgoing xdata[i]):
     [0]              number of index sets in this message (rbuf_0)
     [2*j-1], [2*j]   for j=1..rbuf_0: (index-set number, row count for that set)
     remainder        the row indices themselves, one run per index set

  NOTE(review): xdata[i+1] is assigned at the end of each iteration, so the
  caller is expected to have allocated xdata with at least nrqr+1 slots —
  verify against the caller.

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct = total number of index sets over all messages; total_sz = total
     number of requested rows; both feed the size heuristic below */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic: ct index sets times the average nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all reply messages live in one contiguous slab anchored at xdata[0];
     xdata[i] (i>0) are interior pointers set as each message is finished */
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts every slot consumed in the slab (headers + data) and is the
     trigger for growing the slab */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor into rbuf_i, just past its header */
    ct2    =  ct1;          /* write cursor into xdata[i]; header slots reserved */
    ct3   += ct1;           /* account for the reply header space */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh duplicate-detection table per index set */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* remember where this IS's output run begins */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the slab by 1.5x and rebase the per-message pointers
               xdata[1..i] into the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row: append its column indices from both blocks */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat: shift local column to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat: map compressed column via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: count of columns found, and echo the IS number */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
930 /* -------------------------------------------------------------------------*/
931 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
932 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - gathers the whole parallel matrix A onto
   every process as a sequential matrix B (fast path for the case where each
   process requests the entire matrix).

     Input:
       A     - the distributed MPIAIJ matrix
       flag  - MAT_GET_VALUES to gather the numerical values in addition to
               the nonzero structure; otherwise only the structure is built
       scall - MAT_INITIAL_MATRIX creates B; MAT_REUSE_MATRIX refills a B
               created by an earlier call (structure assumed unchanged)

     Output:
       Bin   - array of length one holding the gathered sequential matrix
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row length = nnz in diagonal block + nnz in off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write my rows directly into B's j array at my row offset; this same
       region is the send buffer for the non-MPI_IN_PLACE gather below */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row in ascending global column order: off-diagonal columns
       left of the diagonal block, then the diagonal block, then the rest */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: structure already gathered, just fetch the existing B */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    /* same in-place scheme as for the column indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values must be interleaved in exactly the same lower/diag/upper order
       used for the column indices so that b->a matches b->j */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs exist only if we took the MAT_INITIAL_MATRIX path */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1120 
1121 
1122 
1123 #undef __FUNCT__
1124 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1125 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1126 {
1127   PetscErrorCode ierr;
1128   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1129   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1130 
1131   PetscFunctionBegin;
1132 
1133   /*
1134        Check for special case: each processor gets entire matrix
1135   */
1136   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1137     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1138     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1139     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1140     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1141     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1142       wantallmatrix = PETSC_TRUE;
1143 
1144       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1145     }
1146   }
1147   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1148   if (twantallmatrix) {
1149     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1150     PetscFunctionReturn(0);
1151   }
1152 
1153   /* Allocate memory to hold all the submatrices */
1154   if (scall != MAT_REUSE_MATRIX) {
1155     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1156   }
1157 
1158   /* Check for special case: each processor gets entire matrix columns */
1159   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1160   for (i=0; i<ismax; i++) {
1161     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1162     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1163     if (colflag && ncol == C->cmap->N) {
1164       allcolumns[i] = PETSC_TRUE;
1165     } else {
1166       allcolumns[i] = PETSC_FALSE;
1167     }
1168   }
1169 
1170   /* Determine the number of stages through which submatrices are done */
1171   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1172 
1173   /*
1174      Each stage will extract nmax submatrices.
1175      nmax is determined by the matrix column dimension.
1176      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1177   */
1178   if (!nmax) nmax = 1;
1179   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1180 
1181   /* Make sure every processor loops through the nstages */
1182   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1183 
1184   for (i=0,pos=0; i<nstages; i++) {
1185     if (pos+nmax <= ismax) max_no = nmax;
1186     else if (pos == ismax) max_no = 0;
1187     else                   max_no = ismax-pos;
1188     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1189     pos += max_no;
1190   }
1191 
1192   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 /* -------------------------------------------------------------------------*/
1197 #undef __FUNCT__
1198 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1199 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1200 {
1201   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1202   Mat            A  = c->A;
1203   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1204   const PetscInt **icol,**irow;
1205   PetscInt       *nrow,*ncol,start;
1206   PetscErrorCode ierr;
1207   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1208   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1209   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1210   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1211   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1212 #if defined(PETSC_USE_CTABLE)
1213   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1214 #else
1215   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1216 #endif
1217   const PetscInt *irow_i;
1218   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1219   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1220   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1221   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1222   MPI_Status     *r_status3,*r_status4,*s_status4;
1223   MPI_Comm       comm;
1224   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1225   PetscMPIInt    *onodes1,*olengths1;
1226   PetscMPIInt    idex,idex2,end;
1227 
1228   PetscFunctionBegin;
1229   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1230   tag0 = ((PetscObject)C)->tag;
1231   size = c->size;
1232   rank = c->rank;
1233 
1234   /* Get some new tags to keep the communication clean */
1235   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1236   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1237   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1238 
1239   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1240 
1241   for (i=0; i<ismax; i++) {
1242     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1243     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1244     if (allcolumns[i]) {
1245       icol[i] = NULL;
1246       ncol[i] = C->cmap->N;
1247     } else {
1248       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1249       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1250     }
1251   }
1252 
1253   /* evaluate communication - mesg to who, length of mesg, and buffer space
1254      required. Based on this, buffers are allocated, and data copied into them*/
1255   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1256   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1257   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1258   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1259   for (i=0; i<ismax; i++) {
1260     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1261     jmax   = nrow[i];
1262     irow_i = irow[i];
1263     for (j=0; j<jmax; j++) {
1264       l   = 0;
1265       row = irow_i[j];
1266       while (row >= C->rmap->range[l+1]) l++;
1267       proc = l;
1268       w4[proc]++;
1269     }
1270     for (j=0; j<size; j++) {
1271       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1272     }
1273   }
1274 
1275   nrqs     = 0;              /* no of outgoing messages */
1276   msz      = 0;              /* total mesg length (for all procs) */
1277   w1[rank] = 0;              /* no mesg sent to self */
1278   w3[rank] = 0;
1279   for (i=0; i<size; i++) {
1280     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1281   }
1282   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1283   for (i=0,j=0; i<size; i++) {
1284     if (w1[i]) { pa[j] = i; j++; }
1285   }
1286 
1287   /* Each message would have a header = 1 + 2*(no of IS) + data */
1288   for (i=0; i<nrqs; i++) {
1289     j      = pa[i];
1290     w1[j] += w2[j] + 2* w3[j];
1291     msz   += w1[j];
1292   }
1293   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1294 
1295   /* Determine the number of messages to expect, their lengths, from from-ids */
1296   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1297   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1298 
1299   /* Now post the Irecvs corresponding to these messages */
1300   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1301 
1302   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1303   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1304 
1305   /* Allocate Memory for outgoing messages */
1306   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1307   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1308   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1309 
1310   {
1311     PetscInt *iptr = tmp,ict = 0;
1312     for (i=0; i<nrqs; i++) {
1313       j        = pa[i];
1314       iptr    += ict;
1315       sbuf1[j] = iptr;
1316       ict      = w1[j];
1317     }
1318   }
1319 
1320   /* Form the outgoing messages */
1321   /* Initialize the header space */
1322   for (i=0; i<nrqs; i++) {
1323     j           = pa[i];
1324     sbuf1[j][0] = 0;
1325     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1326     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1327   }
1328 
1329   /* Parse the isrow and copy data into outbuf */
1330   for (i=0; i<ismax; i++) {
1331     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1332     irow_i = irow[i];
1333     jmax   = nrow[i];
1334     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1335       l   = 0;
1336       row = irow_i[j];
1337       while (row >= C->rmap->range[l+1]) l++;
1338       proc = l;
1339       if (proc != rank) { /* copy to the outgoing buf*/
1340         ctr[proc]++;
1341         *ptr[proc] = row;
1342         ptr[proc]++;
1343       }
1344     }
1345     /* Update the headers for the current IS */
1346     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1347       if ((ctr_j = ctr[j])) {
1348         sbuf1_j        = sbuf1[j];
1349         k              = ++sbuf1_j[0];
1350         sbuf1_j[2*k]   = ctr_j;
1351         sbuf1_j[2*k-1] = i;
1352       }
1353     }
1354   }
1355 
1356   /*  Now  post the sends */
1357   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1358   for (i=0; i<nrqs; ++i) {
1359     j    = pa[i];
1360     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1361   }
1362 
1363   /* Post Receives to capture the buffer size */
1364   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1365   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1366   rbuf2[0] = tmp + msz;
1367   for (i=1; i<nrqs; ++i) {
1368     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1369   }
1370   for (i=0; i<nrqs; ++i) {
1371     j    = pa[i];
1372     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1373   }
1374 
1375   /* Send to other procs the buf size they should allocate */
1376 
1377 
1378   /* Receive messages*/
1379   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1380   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1381   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1382   {
1383     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1384     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1385     PetscInt   *sbuf2_i;
1386 
1387     for (i=0; i<nrqr; ++i) {
1388       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1389 
1390       req_size[idex] = 0;
1391       rbuf1_i        = rbuf1[idex];
1392       start          = 2*rbuf1_i[0] + 1;
1393       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1394       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1395       sbuf2_i        = sbuf2[idex];
1396       for (j=start; j<end; j++) {
1397         id              = rbuf1_i[j] - rstart;
1398         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1399         sbuf2_i[j]      = ncols;
1400         req_size[idex] += ncols;
1401       }
1402       req_source[idex] = r_status1[i].MPI_SOURCE;
1403       /* form the header */
1404       sbuf2_i[0] = req_size[idex];
1405       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1406 
1407       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1408     }
1409   }
1410   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1411   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1412 
1413   /*  recv buffer sizes */
1414   /* Receive messages*/
1415 
1416   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1417   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1418   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1419   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1420   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1421 
1422   for (i=0; i<nrqs; ++i) {
1423     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1424     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1425     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1426     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1427     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1428   }
1429   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1430   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1431 
1432   /* Wait on sends1 and sends2 */
1433   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1434   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1435 
1436   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1437   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1438   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1439   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1440   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1441   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1442 
1443   /* Now allocate buffers for a->j, and send them off */
1444   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1445   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1446   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1447   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1448 
1449   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1450   {
1451     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1452     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1453     PetscInt cend = C->cmap->rend;
1454     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1455 
1456     for (i=0; i<nrqr; i++) {
1457       rbuf1_i   = rbuf1[i];
1458       sbuf_aj_i = sbuf_aj[i];
1459       ct1       = 2*rbuf1_i[0] + 1;
1460       ct2       = 0;
1461       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1462         kmax = rbuf1[i][2*j];
1463         for (k=0; k<kmax; k++,ct1++) {
1464           row    = rbuf1_i[ct1] - rstart;
1465           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1466           ncols  = nzA + nzB;
1467           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1468 
1469           /* load the column indices for this row into cols*/
1470           cols = sbuf_aj_i + ct2;
1471 
1472           lwrite = 0;
1473           for (l=0; l<nzB; l++) {
1474             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1475           }
1476           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1477           for (l=0; l<nzB; l++) {
1478             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1479           }
1480 
1481           ct2 += ncols;
1482         }
1483       }
1484       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1485     }
1486   }
1487   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1488   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1489 
1490   /* Allocate buffers for a->a, and send them off */
1491   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1492   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1493   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1494   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1495 
1496   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1497   {
1498     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1499     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1500     PetscInt    cend   = C->cmap->rend;
1501     PetscInt    *b_j   = b->j;
1502     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1503 
1504     for (i=0; i<nrqr; i++) {
1505       rbuf1_i   = rbuf1[i];
1506       sbuf_aa_i = sbuf_aa[i];
1507       ct1       = 2*rbuf1_i[0]+1;
1508       ct2       = 0;
1509       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1510         kmax = rbuf1_i[2*j];
1511         for (k=0; k<kmax; k++,ct1++) {
1512           row    = rbuf1_i[ct1] - rstart;
1513           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1514           ncols  = nzA + nzB;
1515           cworkB = b_j + b_i[row];
1516           vworkA = a_a + a_i[row];
1517           vworkB = b_a + b_i[row];
1518 
1519           /* load the column values for this row into vals*/
1520           vals = sbuf_aa_i+ct2;
1521 
1522           lwrite = 0;
1523           for (l=0; l<nzB; l++) {
1524             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1525           }
1526           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1527           for (l=0; l<nzB; l++) {
1528             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1529           }
1530 
1531           ct2 += ncols;
1532         }
1533       }
1534       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1535     }
1536   }
1537   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1538   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1539   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1540   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1541 
1542   /* Form the matrix */
1543   /* create col map: global col of C -> local col of submatrices */
1544   {
1545     const PetscInt *icol_i;
1546 #if defined(PETSC_USE_CTABLE)
1547     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1548     for (i=0; i<ismax; i++) {
1549       if (!allcolumns[i]) {
1550         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1551 
1552         jmax   = ncol[i];
1553         icol_i = icol[i];
1554         cmap_i = cmap[i];
1555         for (j=0; j<jmax; j++) {
1556           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1557         }
1558       } else {
1559         cmap[i] = NULL;
1560       }
1561     }
1562 #else
1563     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1564     for (i=0; i<ismax; i++) {
1565       if (!allcolumns[i]) {
1566         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1567         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1568         jmax   = ncol[i];
1569         icol_i = icol[i];
1570         cmap_i = cmap[i];
1571         for (j=0; j<jmax; j++) {
1572           cmap_i[icol_i[j]] = j+1;
1573         }
1574       } else {
1575         cmap[i] = NULL;
1576       }
1577     }
1578 #endif
1579   }
1580 
1581   /* Create lens which is required for MatCreate... */
1582   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1583   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1584   if (ismax) {
1585     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1586     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1587   }
1588   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1589 
1590   /* Update lens from local data */
1591   for (i=0; i<ismax; i++) {
1592     jmax = nrow[i];
1593     if (!allcolumns[i]) cmap_i = cmap[i];
1594     irow_i = irow[i];
1595     lens_i = lens[i];
1596     for (j=0; j<jmax; j++) {
1597       l   = 0;
1598       row = irow_i[j];
1599       while (row >= C->rmap->range[l+1]) l++;
1600       proc = l;
1601       if (proc == rank) {
1602         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1603         if (!allcolumns[i]) {
1604           for (k=0; k<ncols; k++) {
1605 #if defined(PETSC_USE_CTABLE)
1606             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1607 #else
1608             tcol = cmap_i[cols[k]];
1609 #endif
1610             if (tcol) lens_i[j]++;
1611           }
1612         } else { /* allcolumns */
1613           lens_i[j] = ncols;
1614         }
1615         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1616       }
1617     }
1618   }
1619 
1620   /* Create row map: global row of C -> local row of submatrices */
1621 #if defined(PETSC_USE_CTABLE)
1622   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1623   for (i=0; i<ismax; i++) {
1624     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1625     rmap_i = rmap[i];
1626     irow_i = irow[i];
1627     jmax   = nrow[i];
1628     for (j=0; j<jmax; j++) {
1629       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1630     }
1631   }
1632 #else
1633   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1634   if (ismax) {
1635     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1636     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1637   }
1638   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1639   for (i=0; i<ismax; i++) {
1640     rmap_i = rmap[i];
1641     irow_i = irow[i];
1642     jmax   = nrow[i];
1643     for (j=0; j<jmax; j++) {
1644       rmap_i[irow_i[j]] = j;
1645     }
1646   }
1647 #endif
1648 
1649   /* Update lens from offproc data */
1650   {
1651     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1652 
1653     for (tmp2=0; tmp2<nrqs; tmp2++) {
1654       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1655       idex    = pa[idex2];
1656       sbuf1_i = sbuf1[idex];
1657       jmax    = sbuf1_i[0];
1658       ct1     = 2*jmax+1;
1659       ct2     = 0;
1660       rbuf2_i = rbuf2[idex2];
1661       rbuf3_i = rbuf3[idex2];
1662       for (j=1; j<=jmax; j++) {
1663         is_no  = sbuf1_i[2*j-1];
1664         max1   = sbuf1_i[2*j];
1665         lens_i = lens[is_no];
1666         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1667         rmap_i = rmap[is_no];
1668         for (k=0; k<max1; k++,ct1++) {
1669 #if defined(PETSC_USE_CTABLE)
1670           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1671           row--;
1672           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1673 #else
1674           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1675 #endif
1676           max2 = rbuf2_i[ct1];
1677           for (l=0; l<max2; l++,ct2++) {
1678             if (!allcolumns[is_no]) {
1679 #if defined(PETSC_USE_CTABLE)
1680               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1681 #else
1682               tcol = cmap_i[rbuf3_i[ct2]];
1683 #endif
1684               if (tcol) lens_i[row]++;
1685             } else { /* allcolumns */
1686               lens_i[row]++; /* lens_i[row] += max2 ? */
1687             }
1688           }
1689         }
1690       }
1691     }
1692   }
1693   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1694   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1695   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1696   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1697   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1698 
1699   /* Create the submatrices */
1700   if (scall == MAT_REUSE_MATRIX) {
1701     PetscBool flag;
1702 
1703     /*
1704         Assumes new rows are same length as the old rows,hence bug!
1705     */
1706     for (i=0; i<ismax; i++) {
1707       mat = (Mat_SeqAIJ*)(submats[i]->data);
1708       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1709       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1710       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1711       /* Initial matrix as if empty */
1712       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1713 
1714       submats[i]->factortype = C->factortype;
1715     }
1716   } else {
1717     for (i=0; i<ismax; i++) {
1718       PetscInt rbs,cbs;
1719       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1720       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1721 
1722       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1723       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1724 
1725       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1726       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1727       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1728     }
1729   }
1730 
1731   /* Assemble the matrices */
1732   /* First assemble the local rows */
1733   {
1734     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1735     PetscScalar *imat_a;
1736 
1737     for (i=0; i<ismax; i++) {
1738       mat       = (Mat_SeqAIJ*)submats[i]->data;
1739       imat_ilen = mat->ilen;
1740       imat_j    = mat->j;
1741       imat_i    = mat->i;
1742       imat_a    = mat->a;
1743 
1744       if (!allcolumns[i]) cmap_i = cmap[i];
1745       rmap_i = rmap[i];
1746       irow_i = irow[i];
1747       jmax   = nrow[i];
1748       for (j=0; j<jmax; j++) {
1749         l   = 0;
1750         row = irow_i[j];
1751         while (row >= C->rmap->range[l+1]) l++;
1752         proc = l;
1753         if (proc == rank) {
1754           old_row = row;
1755 #if defined(PETSC_USE_CTABLE)
1756           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1757           row--;
1758 #else
1759           row = rmap_i[row];
1760 #endif
1761           ilen_row = imat_ilen[row];
1762           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1763           mat_i    = imat_i[row];
1764           mat_a    = imat_a + mat_i;
1765           mat_j    = imat_j + mat_i;
1766           if (!allcolumns[i]) {
1767             for (k=0; k<ncols; k++) {
1768 #if defined(PETSC_USE_CTABLE)
1769               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1770 #else
1771               tcol = cmap_i[cols[k]];
1772 #endif
1773               if (tcol) {
1774                 *mat_j++ = tcol - 1;
1775                 *mat_a++ = vals[k];
1776                 ilen_row++;
1777               }
1778             }
1779           } else { /* allcolumns */
1780             for (k=0; k<ncols; k++) {
1781               *mat_j++ = cols[k];  /* global col index! */
1782               *mat_a++ = vals[k];
1783               ilen_row++;
1784             }
1785           }
1786           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1787 
1788           imat_ilen[row] = ilen_row;
1789         }
1790       }
1791     }
1792   }
1793 
1794   /*   Now assemble the off proc rows*/
1795   {
1796     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1797     PetscInt    *imat_j,*imat_i;
1798     PetscScalar *imat_a,*rbuf4_i;
1799 
1800     for (tmp2=0; tmp2<nrqs; tmp2++) {
1801       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1802       idex    = pa[idex2];
1803       sbuf1_i = sbuf1[idex];
1804       jmax    = sbuf1_i[0];
1805       ct1     = 2*jmax + 1;
1806       ct2     = 0;
1807       rbuf2_i = rbuf2[idex2];
1808       rbuf3_i = rbuf3[idex2];
1809       rbuf4_i = rbuf4[idex2];
1810       for (j=1; j<=jmax; j++) {
1811         is_no     = sbuf1_i[2*j-1];
1812         rmap_i    = rmap[is_no];
1813         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1814         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1815         imat_ilen = mat->ilen;
1816         imat_j    = mat->j;
1817         imat_i    = mat->i;
1818         imat_a    = mat->a;
1819         max1      = sbuf1_i[2*j];
1820         for (k=0; k<max1; k++,ct1++) {
1821           row = sbuf1_i[ct1];
1822 #if defined(PETSC_USE_CTABLE)
1823           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1824           row--;
1825 #else
1826           row = rmap_i[row];
1827 #endif
1828           ilen  = imat_ilen[row];
1829           mat_i = imat_i[row];
1830           mat_a = imat_a + mat_i;
1831           mat_j = imat_j + mat_i;
1832           max2  = rbuf2_i[ct1];
1833           if (!allcolumns[is_no]) {
1834             for (l=0; l<max2; l++,ct2++) {
1835 
1836 #if defined(PETSC_USE_CTABLE)
1837               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1838 #else
1839               tcol = cmap_i[rbuf3_i[ct2]];
1840 #endif
1841               if (tcol) {
1842                 *mat_j++ = tcol - 1;
1843                 *mat_a++ = rbuf4_i[ct2];
1844                 ilen++;
1845               }
1846             }
1847           } else { /* allcolumns */
1848             for (l=0; l<max2; l++,ct2++) {
1849               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1850               *mat_a++ = rbuf4_i[ct2];
1851               ilen++;
1852             }
1853           }
1854           imat_ilen[row] = ilen;
1855         }
1856       }
1857     }
1858   }
1859 
1860   /* sort the rows */
1861   {
1862     PetscInt    *imat_ilen,*imat_j,*imat_i;
1863     PetscScalar *imat_a;
1864 
1865     for (i=0; i<ismax; i++) {
1866       mat       = (Mat_SeqAIJ*)submats[i]->data;
1867       imat_j    = mat->j;
1868       imat_i    = mat->i;
1869       imat_a    = mat->a;
1870       imat_ilen = mat->ilen;
1871 
1872       if (allcolumns[i]) continue;
1873       jmax = nrow[i];
1874       for (j=0; j<jmax; j++) {
1875         PetscInt ilen;
1876 
1877         mat_i = imat_i[j];
1878         mat_a = imat_a + mat_i;
1879         mat_j = imat_j + mat_i;
1880         ilen  = imat_ilen[j];
1881         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1882       }
1883     }
1884   }
1885 
1886   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1887   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1888   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1889   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1890   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1891 
1892   /* Restore the indices */
1893   for (i=0; i<ismax; i++) {
1894     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1895     if (!allcolumns[i]) {
1896       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1897     }
1898   }
1899 
1900   /* Destroy allocated memory */
1901   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1902   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1903   ierr = PetscFree(pa);CHKERRQ(ierr);
1904 
1905   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1906   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1907   for (i=0; i<nrqr; ++i) {
1908     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1909   }
1910   for (i=0; i<nrqs; ++i) {
1911     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1912     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1913   }
1914 
1915   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1916   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1917   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1918   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1919   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1920   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1921   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1922 
1923 #if defined(PETSC_USE_CTABLE)
1924   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1925 #else
1926   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1927 #endif
1928   ierr = PetscFree(rmap);CHKERRQ(ierr);
1929 
1930   for (i=0; i<ismax; i++) {
1931     if (!allcolumns[i]) {
1932 #if defined(PETSC_USE_CTABLE)
1933       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1934 #else
1935       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1936 #endif
1937     }
1938   }
1939   ierr = PetscFree(cmap);CHKERRQ(ierr);
1940   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1941   ierr = PetscFree(lens);CHKERRQ(ierr);
1942 
1943   for (i=0; i<ismax; i++) {
1944     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1945     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1946   }
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 /*
1951  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1952  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1954  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1955  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1956  state, and needs to be "assembled" later by compressing B's column space.
1957 
1958  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1959  Following this call, C->A & C->B have been created, even if empty.
1960  */
1961 #undef __FUNCT__
1962 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
1963 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1964 {
1965   /* If making this function public, change the error returned in this function away from _PLIB. */
1966   PetscErrorCode ierr;
1967   Mat_MPIAIJ     *aij;
1968   Mat_SeqAIJ     *Baij;
1969   PetscBool      seqaij,Bdisassembled;
1970   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1971   PetscScalar    v;
1972   const PetscInt *rowindices,*colindices;
1973 
1974   PetscFunctionBegin;
1975   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1976   if (A) {
1977     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1978     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1979     if (rowemb) {
1980       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1981       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1982     } else {
1983       if (C->rmap->n != A->rmap->n) {
1984 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1985       }
1986     }
1987     if (dcolemb) {
1988       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
1989       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1990     } else {
1991       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1992     }
1993   }
1994   if (B) {
1995     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1996     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1997     if (rowemb) {
1998       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1999       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2000     } else {
2001       if (C->rmap->n != B->rmap->n) {
2002 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2003       }
2004     }
2005     if (ocolemb) {
2006       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2007       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2008     } else {
2009       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2010     }
2011   }
2012 
2013   aij    = (Mat_MPIAIJ*)(C->data);
2014   if (!aij->A) {
2015     /* Mimic parts of MatMPIAIJSetPreallocation() */
2016     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2017     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2018     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2019     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2020     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2021   }
2022   if (A) {
2023     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2024   } else {
2025     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2026   }
2027   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2028     /*
2029       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2030       need to "disassemble" B -- convert it to using C's global indices.
2031       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2032 
2033       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2034       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2035 
2036       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2037       At least avoid calling MatSetValues() and the implied searches?
2038     */
2039 
2040     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2041 #if defined(PETSC_USE_CTABLE)
2042       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2043 #else
2044       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2045       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2046       if (aij->B) {
2047         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2048       }
2049 #endif
2050       ngcol = 0;
2051       if (aij->lvec) {
2052 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2053       }
2054       if (aij->garray) {
2055 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2056 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2057       }
2058       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2059       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2060     }
2061     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2062       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2063     }
2064     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2065       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2066     }
2067   }
2068   Bdisassembled = PETSC_FALSE;
2069   if (!aij->B) {
2070     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2071     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2072     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2073     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2074     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2075     Bdisassembled = PETSC_TRUE;
2076   }
2077   if (B) {
2078     Baij = (Mat_SeqAIJ*)(B->data);
2079     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2080       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2081       for (i=0; i<B->rmap->n; i++) {
2082 	nz[i] = Baij->i[i+1] - Baij->i[i];
2083       }
2084       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2085       ierr = PetscFree(nz);CHKERRQ(ierr);
2086     }
2087 
2088     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2089     shift = rend-rstart;
2090     count = 0;
2091     rowindices = NULL;
2092     colindices = NULL;
2093     if (rowemb) {
2094       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2095     }
2096     if (ocolemb) {
2097       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2098     }
2099     for (i=0; i<B->rmap->n; i++) {
2100       PetscInt row;
2101       row = i;
2102       if (rowindices) row = rowindices[i];
2103       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2104 	col  = Baij->j[count];
2105 	if (colindices) col = colindices[col];
2106 	if (Bdisassembled && col>=rstart) col += shift;
2107 	v    = Baij->a[count];
2108 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
2109 	++count;
2110       }
2111     }
2112     /* No assembly for aij->B is necessary. */
2113     /* FIXME: set aij->B's nonzerostate correctly. */
2114   } else {
2115     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
2116   }
2117   C->preallocated  = PETSC_TRUE;
2118   C->was_assembled = PETSC_FALSE;
2119   C->assembled     = PETSC_FALSE;
2120    /*
2121       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2122       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2123    */
2124   PetscFunctionReturn(0);
2125 }
2126 
2127 #undef __FUNCT__
2128 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
2129 /*
2130   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
2131  */
2132 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2133 {
2134   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2135 
2136   PetscFunctionBegin;
2137   PetscValidPointer(A,2);
2138   PetscValidPointer(B,3);
2139   /* FIXME: make sure C is assembled */
2140   *A = aij->A;
2141   *B = aij->B;
2142   /* Note that we don't incref *A and *B, so be careful! */
2143   PetscFunctionReturn(0);
2144 }
2145 
2146 /*
2147   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2148   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2149 */
2150 #undef __FUNCT__
2151 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
2152 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2153                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2154 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2155 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2156 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2157 {
2158   PetscErrorCode ierr;
2159   PetscMPIInt    isize,flag;
2160   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
2161   Mat            *A,*B;
2162   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
2163 
2164   PetscFunctionBegin;
2165   if (!ismax) PetscFunctionReturn(0);
2166 
2167   for (i = 0, cismax = 0; i < ismax; ++i) {
2168     PetscMPIInt isize;
2169     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
2170     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2171     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
2172     if (isize > 1) ++cismax;
2173   }
2174   /*
2175      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2176      ispar counts the number of parallel ISs across C's comm.
2177   */
2178   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2179   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2180     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2181     PetscFunctionReturn(0);
2182   }
2183 
2184   /* if (ispar) */
2185   /*
2186     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2187     These are used to extract the off-diag portion of the resulting parallel matrix.
2188     The row IS for the off-diag portion is the same as for the diag portion,
2189     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2190   */
2191   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
2192   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
2193   for (i = 0, ii = 0; i < ismax; ++i) {
2194     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2195     if (isize > 1) {
2196       /*
2197 	 TODO: This is the part that's ***NOT SCALABLE***.
2198 	 To fix this we need to extract just the indices of C's nonzero columns
2199 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2200 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
2201 	 to be done without serializing on the IS list, so, most likely, it is best
2202 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2203       */
2204       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
2205       /* Now we have to
2206 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2207 	     were sorted on each rank, concatenated they might no longer be sorted;
2208 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2209 	     indices in the nondecreasing order to the original index positions.
2210 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2211       */
2212       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
2213       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
2214       ++ii;
2215     }
2216   }
2217   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
2218   for (i = 0, ii = 0; i < ismax; ++i) {
2219     PetscInt       j,issize;
2220     const PetscInt *indices;
2221 
2222     /*
2223        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2224      */
2225     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
2226     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
2227     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
2228     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
2229     for (j = 1; j < issize; ++j) {
2230       if (indices[j] == indices[j-1]) {
2231 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2232       }
2233     }
2234     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
2235 
2236 
2237     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
2238     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
2239     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
2240     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
2241     for (j = 1; j < issize; ++j) {
2242       if (indices[j-1] == indices[j]) {
2243 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2244       }
2245     }
2246     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
2247     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2248     if (isize > 1) {
2249       cisrow[ii] = isrow[i];
2250       ++ii;
2251     }
2252   }
2253   /*
2254     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2255     array of sequential matrices underlying the resulting parallel matrices.
2256     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2257     contain duplicates.
2258 
2259     There are as many diag matrices as there are original index sets. There are only as many parallel
2260     and off-diag matrices, as there are parallel (comm size > 1) index sets.
2261 
2262     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2263     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2264       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2265       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2266       with A[i] and B[ii] extracted from the corresponding MPI submat.
2267     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2268       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
2269       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2270       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
2271       values into A[i] and B[ii] sitting inside the corresponding submat.
2272     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2273       A[i], B[ii], AA[i] or BB[ii] matrices.
2274   */
2275   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2276   if (scall == MAT_INITIAL_MATRIX) {
2277     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2278     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2279   } else {
2280     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
2281     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
2282     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2283     for (i = 0, ii = 0; i < ismax; ++i) {
2284       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2285       if (isize > 1) {
2286 	Mat AA,BB;
2287 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
2288 	if (!isrow_p[i] && !iscol_p[i]) {
2289 	  A[i] = AA;
2290 	} else {
2291 	  /* TODO: extract A[i] composed on AA. */
2292 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2293 	}
2294 	if (!isrow_p[i] && !ciscol_p[ii]) {
2295 	  B[ii] = BB;
2296 	} else {
2297 	  /* TODO: extract B[ii] composed on BB. */
2298 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
2299 	}
2300 	++ii;
2301       } else {
2302 	if (!isrow_p[i] && !iscol_p[i]) {
2303 	  A[i] = (*submat)[i];
2304 	} else {
2305 	  /* TODO: extract A[i] composed on (*submat)[i]. */
2306 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2307 	}
2308       }
2309     }
2310   }
2311   /* Now obtain the sequential A and B submatrices separately. */
2312   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
2313   /* I did not figure out a good way to fix it right now.
2314    * Local column size of B[i] is different from the size of ciscol[i].
2315    * B[i]'s size is finally determined by MatAssembly, while
2316    * ciscol[i] is computed as the complement of iscol[i].
2317    * It is better to keep only nonzero indices when computing
2318    * the complement ciscol[i].
2319    * */
2320   if(scall==MAT_REUSE_MATRIX){
2321 	for(i=0; i<cismax; i++){
2322 	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
2323 	  B[i]->cmap->n = ciscol_localsize;
2324 	}
2325   }
2326   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
2327   /*
2328     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2329     matrices A & B have been extracted directly into the parallel matrices containing them, or
2330     simply into the sequential matrix identical with the corresponding A (if isize == 1).
2331     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2332     to have the same sparsity pattern.
2333     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2334     must be constructed for C. This is done by setseqmat(s).
2335   */
2336   for (i = 0, ii = 0; i < ismax; ++i) {
2337     /*
2338        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2339        That way we can avoid sorting and computing permutations when reusing.
2340        To this end:
2341         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2342 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
2343 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2344     */
2345     MatStructure pattern;
2346     if (scall == MAT_INITIAL_MATRIX) {
2347       pattern = DIFFERENT_NONZERO_PATTERN;
2348     } else {
2349       pattern = SAME_NONZERO_PATTERN;
2350     }
2351     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2352     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2353     if (isize > 1) {
2354       if (scall == MAT_INITIAL_MATRIX) {
2355 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
2356 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2357 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
2358 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
2359 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
2360       }
2361       /*
2362 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2363       */
2364       {
2365 	Mat AA,BB;
2366 	AA = NULL;
2367 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2368 	BB = NULL;
2369 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2370 	if (AA || BB) {
2371 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
2372 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2373 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2374 	}
2375 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2376 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2377 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
2378 	}
2379 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2380 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
2381 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
2382 	}
2383       }
2384       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
2385       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
2386       ++ii;
2387     } else { /* if (isize == 1) */
2388       if (scall == MAT_INITIAL_MATRIX) {
2389 	if (isrow_p[i] || iscol_p[i]) {
2390 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
2391 	} else (*submat)[i] = A[i];
2392       }
2393       if (isrow_p[i] || iscol_p[i]) {
2394 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
2395 	/* Otherwise A is extracted straight into (*submats)[i]. */
	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2397 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
2398       }
2399     }
2400     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
2401     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
2402   }
2403   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
2404   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
2405   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
2406   ierr = PetscFree(A);CHKERRQ(ierr);
2407   ierr = PetscFree(B);CHKERRQ(ierr);
2408   PetscFunctionReturn(0);
2409 }
2410 
2411 
2412 
2413 #undef __FUNCT__
2414 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
2415 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2416 {
2417   PetscErrorCode ierr;
2418 
2419   PetscFunctionBegin;
2420   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
2421   PetscFunctionReturn(0);
2422 }
2423