xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 95dccacacae8a8fc0b691f9b4fba69a249b61188)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #include <../src/mat/impls/aij/mpi/mpiaij.h>
8 #include <petscbt.h>
9 #include <petscsf.h>
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21 
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
26 {
27   PetscErrorCode ierr;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   for (i=0; i<ov; ++i) {
33     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
34   }
35   PetscFunctionReturn(0);
36 }
37 
38 #undef __FUNCT__
39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
41 {
42   PetscErrorCode ierr;
43   PetscInt       i;
44 
45   PetscFunctionBegin;
46   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
47   for (i=0; i<ov; ++i) {
48     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
49   }
50   PetscFunctionReturn(0);
51 }
52 
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
56 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
57 {
58   PetscErrorCode   ierr;
59   MPI_Comm         comm;
60   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
61   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
62   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
63   const PetscInt  *indices_i,**indices;
64   PetscLayout      rmap;
65   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
66   PetscSF          sf;
67   PetscSFNode     *remote;
68 
69   PetscFunctionBegin;
70   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
71   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
72   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
73   /* get row map to determine where rows should be going */
74   ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
75   /* retrieve IS data and put all together so that we
76    * can optimize communication
77    *  */
78   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
79   for (i=0,tlength=0; i<nidx; i++){
80     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
81     tlength += length[i];
82     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
83   }
84   /* find these rows on remote processors */
85   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
86   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
87   nrrows = 0;
88   for (i=0; i<nidx; i++){
89     length_i     = length[i];
90     indices_i    = indices[i];
91     for (j=0; j<length_i; j++){
92       owner = -1;
93       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
94       /* remote processors */
95       if (owner != rank){
96         tosizes_temp[owner]++; /* number of rows to owner */
97         rrow_ranks[nrrows]  = owner; /* processor */
98         rrow_isids[nrrows]   = i; /* is id */
99         remoterows[nrrows++] = indices_i[j]; /* row */
100       }
101     }
102     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
103   }
104   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
105   /* test if we need to exchange messages
106    * generally speaking, we do not need to exchange
107    * data when overlap is 1
108    * */
109   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
110   /* we do not have any messages
111    * It usually corresponds to overlap 1
112    * */
113   if (!reducednrrows){
114     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
115     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
116     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
117     PetscFunctionReturn(0);
118   }
119   nto = 0;
120   /* send sizes and ranks for building a two-sided communcation */
121   for (i=0; i<size; i++){
122    if (tosizes_temp[i]){
123      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
124      tosizes_temp[i] = nto; /* a map from processor to index */
125      toranks[nto++]  = i; /* processor */
126    }
127   }
128   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
129   for (i=0; i<nto; i++){
130     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
131     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
132   }
133   /* send information to other processors */
134   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
135   nrecvrows = 0;
136   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
137   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
138   nrecvrows = 0;
139   for (i=0; i<nfrom; i++){
140     for (j=0; j<fromsizes[2*i]; j++){
141       remote[nrecvrows].rank    = fromranks[i];
142       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
143     }
144   }
145   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
146   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
147   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
148   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
149   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
150   /* message pair <no of is, row>  */
151   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
152   for (i=0; i<nrrows; i++){
153     owner = rrow_ranks[i]; /* processor */
154     j     = tosizes_temp[owner]; /* index */
155     todata[toffsets[j]++] = rrow_isids[i];
156     todata[toffsets[j]++] = remoterows[i];
157   }
158   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
159   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
160   ierr = PetscFree(toffsets);CHKERRQ(ierr);
161   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
162   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
163   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
164   /* send rows belonging to the remote so that then we could get the overlapping data back */
165   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
166   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
167   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
168   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
169   ierr = PetscFree(fromranks);CHKERRQ(ierr);
170   nrecvrows = 0;
171   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
172   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
173   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
174   nrecvrows = 0;
175   for (i=0; i<nto; i++){
176     for (j=0; j<tosizes[2*i]; j++){
177       remote[nrecvrows].rank    = toranks[i];
178       remote[nrecvrows++].index = tosizes[2*i+1]+j;
179     }
180   }
181   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
182   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
183   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
184   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
185   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
186   /* overlap communication and computation */
187   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
188   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
189   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
190   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
191   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
192   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
193   ierr = PetscFree(toranks);CHKERRQ(ierr);
194   ierr = PetscFree(tosizes);CHKERRQ(ierr);
195   ierr = PetscFree(todata);CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
201 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
202 {
203   PetscInt         *isz,isz_i,i,j,is_id, data_size;
204   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
205   const PetscInt   *indices_i_temp;
206   PetscErrorCode    ierr;
207 
208   PetscFunctionBegin;
209   max_lsize = 0;
210   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
211   for (i=0; i<nidx; i++){
212     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
213     max_lsize = lsize>max_lsize ? lsize:max_lsize;
214     isz[i]    = lsize;
215   }
216   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
217   for (i=0; i<nidx; i++){
218     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
219     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
220     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
221     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
222   }
223   /* retrieve information to get row id and its overlap */
224   for (i=0; i<nrecvs; ){
225     is_id      = recvdata[i++];
226     data_size  = recvdata[i++];
227     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
228     isz_i      = isz[is_id];
229     for (j=0; j< data_size; j++){
230       col = recvdata[i++];
231       indices_i[isz_i++] = col;
232     }
233     isz[is_id] = isz_i;
234   }
235   /* remove duplicate entities */
236   for (i=0; i<nidx; i++){
237     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
238     isz_i      = isz[i];
239     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
240     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
241   }
242   ierr = PetscFree(isz);CHKERRQ(ierr);
243   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
244   PetscFunctionReturn(0);
245 }
246 
247 #undef __FUNCT__
248 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
249 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
250 {
251   PetscLayout       rmap,cmap;
252   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
253   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
254   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
255   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
256   Mat               amat,bmat;
257   PetscMPIInt       rank;
258   PetscBool         done;
259   MPI_Comm          comm;
260   PetscErrorCode    ierr;
261 
262   PetscFunctionBegin;
263   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
264   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
265   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
266   /* Even if the mat is symmetric, we still assume it is not symmetric */
267   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
268   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
269   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
270   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
271   /* total number of nonzero values is used to estimate the memory usage in the next step */
272   tnz  = ai[an]+bi[bn];
273   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
274   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
275   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
276   /* to find the longest message */
277   max_fszs = 0;
278   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
279   /* better way to estimate number of nonzero in the mat??? */
280   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
281   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
282   rows_pos  = 0;
283   totalrows = 0;
284   for (i=0; i<nfrom; i++){
285     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
286     /* group data together */
287     for (j=0; j<fromsizes[2*i]; j+=2){
288       is_id                       = fromrows[rows_pos++];/* no of is */
289       rows_i                      = rows_data[is_id];
290       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
291     }
292     /* estimate a space to avoid multiple allocations  */
293     for (j=0; j<nidx; j++){
294       indvc_ij = 0;
295       rows_i   = rows_data[j];
296       for (l=0; l<rows_pos_i[j]; l++){
297         row    = rows_i[l]-rstart;
298         start  = ai[row];
299         end    = ai[row+1];
300         for (k=start; k<end; k++){ /* Amat */
301           col = aj[k] + cstart;
302           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
303         }
304         start = bi[row];
305         end   = bi[row+1];
306         for (k=start; k<end; k++) { /* Bmat */
307           col = gcols[bj[k]];
308           indices_tmp[indvc_ij++] = col;
309         }
310       }
311       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
312       indv_counts[i*nidx+j] = indvc_ij;
313       totalrows            += indvc_ij;
314     }
315   }
316   /* message triple <no of is, number of rows, rows> */
317   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
318   totalrows = 0;
319   rows_pos  = 0;
320   /* use this code again */
321   for (i=0;i<nfrom;i++){
322     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
323     for (j=0; j<fromsizes[2*i]; j+=2){
324       is_id                       = fromrows[rows_pos++];
325       rows_i                      = rows_data[is_id];
326       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
327     }
328     /* add data  */
329     for (j=0; j<nidx; j++){
330       if (!indv_counts[i*nidx+j]) continue;
331       indvc_ij = 0;
332       sbdata[totalrows++] = j;
333       sbdata[totalrows++] = indv_counts[i*nidx+j];
334       sbsizes[2*i]       += 2;
335       rows_i              = rows_data[j];
336       for (l=0; l<rows_pos_i[j]; l++){
337         row   = rows_i[l]-rstart;
338         start = ai[row];
339         end   = ai[row+1];
340         for (k=start; k<end; k++){ /* Amat */
341           col = aj[k] + cstart;
342           indices_tmp[indvc_ij++] = col;
343         }
344         start = bi[row];
345         end   = bi[row+1];
346         for (k=start; k<end; k++) { /* Bmat */
347           col = gcols[bj[k]];
348           indices_tmp[indvc_ij++] = col;
349         }
350       }
351       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
352       sbsizes[2*i]  += indvc_ij;
353       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
354       totalrows += indvc_ij;
355     }
356   }
357   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
358   for (i=0; i<nfrom; i++){
359     offsets[i+1]   = offsets[i] + sbsizes[2*i];
360     sbsizes[2*i+1] = offsets[i];
361   }
362   ierr = PetscFree(offsets);CHKERRQ(ierr);
363   if (sbrowsizes) *sbrowsizes = sbsizes;
364   if (sbrows) *sbrows = sbdata;
365   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
366   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
367   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 
371 #undef __FUNCT__
372 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
373 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
374 {
375   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
376   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
377   PetscInt          lsize,lsize_tmp,owner;
378   PetscMPIInt       rank;
379   Mat               amat,bmat;
380   PetscBool         done;
381   PetscLayout       cmap,rmap;
382   MPI_Comm          comm;
383   PetscErrorCode    ierr;
384 
385   PetscFunctionBegin;
386   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
387   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
388   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
389   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
390   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
391   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
392   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
393   /* is it a safe way to compute number of nonzero values ? */
394   tnz  = ai[an]+bi[bn];
395   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
396   ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
397   ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
398   /* it is a better way to estimate memory than the old implementation
399    * where global size of matrix is used
400    * */
401   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
402   for (i=0; i<nidx; i++) {
403     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
404     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
405     lsize_tmp = 0;
406     for (j=0; j<lsize; j++) {
407       owner = -1;
408       row   = indices[j];
409       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
410       if (owner != rank) continue;
411       /* local number */
412       row  -= rstart;
413       start = ai[row];
414       end   = ai[row+1];
415       for (k=start; k<end; k++) { /* Amat */
416         col = aj[k] + cstart;
417         indices_temp[lsize_tmp++] = col;
418       }
419       start = bi[row];
420       end   = bi[row+1];
421       for (k=start; k<end; k++) { /* Bmat */
422         col = gcols[bj[k]];
423         indices_temp[lsize_tmp++] = col;
424       }
425     }
426    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
427    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
428    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
429    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
430   }
431   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
432   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
433   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
434   PetscFunctionReturn(0);
435 }
436 
437 
438 /*
439   Sample message format:
440   If a processor A wants processor B to process some elements corresponding
441   to index sets is[1],is[5]
442   mesg [0] = 2   (no of index sets in the mesg)
443   -----------
444   mesg [1] = 1 => is[1]
445   mesg [2] = sizeof(is[1]);
446   -----------
447   mesg [3] = 5  => is[5]
448   mesg [4] = sizeof(is[5]);
449   -----------
450   mesg [5]
451   mesg [n]  datas[1]
452   -----------
453   mesg[n+1]
454   mesg[m]  data(is[5])
455   -----------
456 
457   Notes:
458   nrqs - no of requests sent (or to be sent out)
459   nrqr - no of requests recieved (which have to be or which have been processed
460 */
461 #undef __FUNCT__
462 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
463 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
464 {
465   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
466   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
467   const PetscInt **idx,*idx_i;
468   PetscInt       *n,**data,len;
469   PetscErrorCode ierr;
470   PetscMPIInt    size,rank,tag1,tag2;
471   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
472   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
473   PetscBT        *table;
474   MPI_Comm       comm;
475   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476   MPI_Status     *s_status,*recv_status;
477   char           *t_p;
478 
479   PetscFunctionBegin;
480   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
481   size = c->size;
482   rank = c->rank;
483   M    = C->rmap->N;
484 
485   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
486   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
487 
488   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
489 
490   for (i=0; i<imax; i++) {
491     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
492     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
493   }
494 
495   /* evaluate communication - mesg to who,length of mesg, and buffer space
496      required. Based on this, buffers are allocated, and data copied into them  */
497   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
498   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
499   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
500   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
501   for (i=0; i<imax; i++) {
502     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
503     idx_i = idx[i];
504     len   = n[i];
505     for (j=0; j<len; j++) {
506       row = idx_i[j];
507       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
508       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
509       w4[proc]++;
510     }
511     for (j=0; j<size; j++) {
512       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
513     }
514   }
515 
516   nrqs     = 0;              /* no of outgoing messages */
517   msz      = 0;              /* total mesg length (for all proc */
518   w1[rank] = 0;              /* no mesg sent to intself */
519   w3[rank] = 0;
520   for (i=0; i<size; i++) {
521     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
522   }
523   /* pa - is list of processors to communicate with */
524   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
525   for (i=0,j=0; i<size; i++) {
526     if (w1[i]) {pa[j] = i; j++;}
527   }
528 
529   /* Each message would have a header = 1 + 2*(no of IS) + data */
530   for (i=0; i<nrqs; i++) {
531     j      = pa[i];
532     w1[j] += w2[j] + 2*w3[j];
533     msz   += w1[j];
534   }
535 
536   /* Determine the number of messages to expect, their lengths, from from-ids */
537   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
538   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
539 
540   /* Now post the Irecvs corresponding to these messages */
541   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
542 
543   /* Allocate Memory for outgoing messages */
544   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
545   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
546   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
547 
548   {
549     PetscInt *iptr = tmp,ict  = 0;
550     for (i=0; i<nrqs; i++) {
551       j         = pa[i];
552       iptr     +=  ict;
553       outdat[j] = iptr;
554       ict       = w1[j];
555     }
556   }
557 
558   /* Form the outgoing messages */
559   /* plug in the headers */
560   for (i=0; i<nrqs; i++) {
561     j            = pa[i];
562     outdat[j][0] = 0;
563     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
564     ptr[j]       = outdat[j] + 2*w3[j] + 1;
565   }
566 
567   /* Memory for doing local proc's work */
568   {
569     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
570 
571     for (i=0; i<imax; i++) {
572       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
573       data[i]  = d_p + M*i;
574     }
575   }
576 
577   /* Parse the IS and update local tables and the outgoing buf with the data */
578   {
579     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
580     PetscBT  table_i;
581 
582     for (i=0; i<imax; i++) {
583       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
584       n_i     = n[i];
585       table_i = table[i];
586       idx_i   = idx[i];
587       data_i  = data[i];
588       isz_i   = isz[i];
589       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
590         row  = idx_i[j];
591         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
592         if (proc != rank) { /* copy to the outgoing buffer */
593           ctr[proc]++;
594           *ptr[proc] = row;
595           ptr[proc]++;
596         } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
597       }
598       /* Update the headers for the current IS */
599       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
600         if ((ctr_j = ctr[j])) {
601           outdat_j        = outdat[j];
602           k               = ++outdat_j[0];
603           outdat_j[2*k]   = ctr_j;
604           outdat_j[2*k-1] = i;
605         }
606       }
607       isz[i] = isz_i;
608     }
609   }
610 
611   /*  Now  post the sends */
612   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
613   for (i=0; i<nrqs; ++i) {
614     j    = pa[i];
615     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
616   }
617 
618   /* No longer need the original indices */
619   for (i=0; i<imax; ++i) {
620     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
621   }
622   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
623 
624   for (i=0; i<imax; ++i) {
625     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
626   }
627 
628   /* Do Local work */
629   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
630 
631   /* Receive messages */
632   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
633   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
634 
635   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
636   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
637 
638   /* Phase 1 sends are complete - deallocate buffers */
639   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
640   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
641 
642   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
643   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
644   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
645   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
646   ierr = PetscFree(rbuf);CHKERRQ(ierr);
647 
648 
649   /* Send the data back */
650   /* Do a global reduction to know the buffer space req for incoming messages */
651   {
652     PetscMPIInt *rw1;
653 
654     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
655 
656     for (i=0; i<nrqr; ++i) {
657       proc = recv_status[i].MPI_SOURCE;
658 
659       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
660       rw1[proc] = isz1[i];
661     }
662     ierr = PetscFree(onodes1);CHKERRQ(ierr);
663     ierr = PetscFree(olengths1);CHKERRQ(ierr);
664 
665     /* Determine the number of messages to expect, their lengths, from from-ids */
666     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
667     ierr = PetscFree(rw1);CHKERRQ(ierr);
668   }
669   /* Now post the Irecvs corresponding to these messages */
670   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
671 
672   /* Now  post the sends */
673   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
674   for (i=0; i<nrqr; ++i) {
675     j    = recv_status[i].MPI_SOURCE;
676     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
677   }
678 
679   /* receive work done on other processors */
680   {
681     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
682     PetscMPIInt idex;
683     PetscBT     table_i;
684     MPI_Status  *status2;
685 
686     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
687     for (i=0; i<nrqs; ++i) {
688       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
689       /* Process the message */
690       rbuf2_i = rbuf2[idex];
691       ct1     = 2*rbuf2_i[0]+1;
692       jmax    = rbuf2[idex][0];
693       for (j=1; j<=jmax; j++) {
694         max     = rbuf2_i[2*j];
695         is_no   = rbuf2_i[2*j-1];
696         isz_i   = isz[is_no];
697         data_i  = data[is_no];
698         table_i = table[is_no];
699         for (k=0; k<max; k++,ct1++) {
700           row = rbuf2_i[ct1];
701           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
702         }
703         isz[is_no] = isz_i;
704       }
705     }
706 
707     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
708     ierr = PetscFree(status2);CHKERRQ(ierr);
709   }
710 
711   for (i=0; i<imax; ++i) {
712     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
713   }
714 
715   ierr = PetscFree(onodes2);CHKERRQ(ierr);
716   ierr = PetscFree(olengths2);CHKERRQ(ierr);
717 
718   ierr = PetscFree(pa);CHKERRQ(ierr);
719   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
720   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
721   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
722   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
723   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
724   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
725   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
726   ierr = PetscFree(s_status);CHKERRQ(ierr);
727   ierr = PetscFree(recv_status);CHKERRQ(ierr);
728   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
729   ierr = PetscFree(xdata);CHKERRQ(ierr);
730   ierr = PetscFree(isz1);CHKERRQ(ierr);
731   PetscFunctionReturn(0);
732 }
733 
734 #undef __FUNCT__
735 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
736 /*
737    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
738        the work on the local processor.
739 
740      Inputs:
741       C      - MAT_MPIAIJ;
742       imax - total no of index sets processed at a time;
743       table  - an array of char - size = m bits.
744 
745      Output:
746       isz    - array containing the count of the solution elements corresponding
747                to each index set;
748       data   - pointer to the solutions
749 */
750 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
751 {
752   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
753   Mat        A  = c->A,B = c->B;
754   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
755   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
756   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
757   PetscBT    table_i;
758 
759   PetscFunctionBegin;
760   rstart = C->rmap->rstart;
761   cstart = C->cmap->rstart;
762   ai     = a->i;
763   aj     = a->j;
764   bi     = b->i;
765   bj     = b->j;
766   garray = c->garray;
767 
768 
769   for (i=0; i<imax; i++) {
770     data_i  = data[i];
771     table_i = table[i];
772     isz_i   = isz[i];
773     for (j=0,max=isz[i]; j<max; j++) {
774       row   = data_i[j] - rstart;
775       start = ai[row];
776       end   = ai[row+1];
777       for (k=start; k<end; k++) { /* Amat */
778         val = aj[k] + cstart;
779         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
780       }
781       start = bi[row];
782       end   = bi[row+1];
783       for (k=start; k<end; k++) { /* Bmat */
784         val = garray[bj[k]];
785         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
786       }
787     }
788     isz[i] = isz_i;
789   }
790   PetscFunctionReturn(0);
791 }
792 
793 #undef __FUNCT__
794 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
795 /*
796       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
797          and return the output
798 
799          Input:
800            C    - the matrix
801            nrqr - no of messages being processed.
802            rbuf - an array of pointers to the recieved requests
803 
804          Output:
805            xdata - array of messages to be sent back
806            isz1  - size of each message
807 
808   For better efficiency perhaps we should malloc separately each xdata[i],
809 then if a remalloc is required we need only copy the data for that one row
810 rather then all previous rows as it is now where a single large chunck of
811 memory is used.
812 
813 */
814 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
815 {
816   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
817   Mat            A  = c->A,B = c->B;
818   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819   PetscErrorCode ierr;
820   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
821   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
822   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
823   PetscInt       *rbuf_i,kmax,rbuf_0;
824   PetscBT        xtable;
825 
826   PetscFunctionBegin;
827   m      = C->rmap->N;
828   rstart = C->rmap->rstart;
829   cstart = C->cmap->rstart;
830   ai     = a->i;
831   aj     = a->j;
832   bi     = b->i;
833   bj     = b->j;
834   garray = c->garray;
835 
836 
837   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
838     rbuf_i =  rbuf[i];
839     rbuf_0 =  rbuf_i[0];
840     ct    += rbuf_0;
841     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
842   }
843 
844   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
845   else max1 = 1;
846   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
847   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
848   ++no_malloc;
849   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
850   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
851 
852   ct3 = 0;
853   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
854     rbuf_i =  rbuf[i];
855     rbuf_0 =  rbuf_i[0];
856     ct1    =  2*rbuf_0+1;
857     ct2    =  ct1;
858     ct3   += ct1;
859     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
860       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
861       oct2 = ct2;
862       kmax = rbuf_i[2*j];
863       for (k=0; k<kmax; k++,ct1++) {
864         row = rbuf_i[ct1];
865         if (!PetscBTLookupSet(xtable,row)) {
866           if (!(ct3 < mem_estimate)) {
867             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
868             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
869             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
870             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
871             xdata[0]     = tmp;
872             mem_estimate = new_estimate; ++no_malloc;
873             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
874           }
875           xdata[i][ct2++] = row;
876           ct3++;
877         }
878       }
879       for (k=oct2,max2=ct2; k<max2; k++) {
880         row   = xdata[i][k] - rstart;
881         start = ai[row];
882         end   = ai[row+1];
883         for (l=start; l<end; l++) {
884           val = aj[l] + cstart;
885           if (!PetscBTLookupSet(xtable,val)) {
886             if (!(ct3 < mem_estimate)) {
887               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
888               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
889               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
890               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
891               xdata[0]     = tmp;
892               mem_estimate = new_estimate; ++no_malloc;
893               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
894             }
895             xdata[i][ct2++] = val;
896             ct3++;
897           }
898         }
899         start = bi[row];
900         end   = bi[row+1];
901         for (l=start; l<end; l++) {
902           val = garray[bj[l]];
903           if (!PetscBTLookupSet(xtable,val)) {
904             if (!(ct3 < mem_estimate)) {
905               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
906               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
907               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
908               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
909               xdata[0]     = tmp;
910               mem_estimate = new_estimate; ++no_malloc;
911               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
912             }
913             xdata[i][ct2++] = val;
914             ct3++;
915           }
916         }
917       }
918       /* Update the header*/
919       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
920       xdata[i][2*j-1] = rbuf_i[2*j-1];
921     }
922     xdata[i][0] = rbuf_0;
923     xdata[i+1]  = xdata[i] + ct2;
924     isz1[i]     = ct2; /* size of each message */
925   }
926   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
927   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 /* -------------------------------------------------------------------------*/
931 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
932 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
933 /*
934     Every processor gets the entire matrix
935 */
936 #undef __FUNCT__
937 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
938 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
939 {
940   Mat            B;
941   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
942   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
943   PetscErrorCode ierr;
944   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
945   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
946   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
947   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
948 
949   PetscFunctionBegin;
950   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
951   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
952 
953   if (scall == MAT_INITIAL_MATRIX) {
954     /* ----------------------------------------------------------------
955          Tell every processor the number of nonzeros per row
956     */
957     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
958     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
959       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
960     }
961     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
962     for (i=0; i<size; i++) {
963       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
964       displs[i]     = A->rmap->range[i];
965     }
966 #if defined(PETSC_HAVE_MPI_IN_PLACE)
967     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
968 #else
969     sendcount = A->rmap->rend - A->rmap->rstart;
970     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
971 #endif
972     /* ---------------------------------------------------------------
973          Create the sequential matrix of the same type as the local block diagonal
974     */
975     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
976     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
977     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
978     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
979     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
980     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
981     **Bin = B;
982     b     = (Mat_SeqAIJ*)B->data;
983 
984     /*--------------------------------------------------------------------
985        Copy my part of matrix column indices over
986     */
987     sendcount  = ad->nz + bd->nz;
988     jsendbuf   = b->j + b->i[rstarts[rank]];
989     a_jsendbuf = ad->j;
990     b_jsendbuf = bd->j;
991     n          = A->rmap->rend - A->rmap->rstart;
992     cnt        = 0;
993     for (i=0; i<n; i++) {
994 
995       /* put in lower diagonal portion */
996       m = bd->i[i+1] - bd->i[i];
997       while (m > 0) {
998         /* is it above diagonal (in bd (compressed) numbering) */
999         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1000         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1001         m--;
1002       }
1003 
1004       /* put in diagonal portion */
1005       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1006         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1007       }
1008 
1009       /* put in upper diagonal portion */
1010       while (m-- > 0) {
1011         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1012       }
1013     }
1014     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1015 
1016     /*--------------------------------------------------------------------
1017        Gather all column indices to all processors
1018     */
1019     for (i=0; i<size; i++) {
1020       recvcounts[i] = 0;
1021       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1022         recvcounts[i] += lens[j];
1023       }
1024     }
1025     displs[0] = 0;
1026     for (i=1; i<size; i++) {
1027       displs[i] = displs[i-1] + recvcounts[i-1];
1028     }
1029 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1030     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1031 #else
1032     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1033 #endif
1034     /*--------------------------------------------------------------------
1035         Assemble the matrix into useable form (note numerical values not yet set)
1036     */
1037     /* set the b->ilen (length of each row) values */
1038     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1039     /* set the b->i indices */
1040     b->i[0] = 0;
1041     for (i=1; i<=A->rmap->N; i++) {
1042       b->i[i] = b->i[i-1] + lens[i-1];
1043     }
1044     ierr = PetscFree(lens);CHKERRQ(ierr);
1045     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1046     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1047 
1048   } else {
1049     B = **Bin;
1050     b = (Mat_SeqAIJ*)B->data;
1051   }
1052 
1053   /*--------------------------------------------------------------------
1054        Copy my part of matrix numerical values into the values location
1055   */
1056   if (flag == MAT_GET_VALUES) {
1057     sendcount = ad->nz + bd->nz;
1058     sendbuf   = b->a + b->i[rstarts[rank]];
1059     a_sendbuf = ad->a;
1060     b_sendbuf = bd->a;
1061     b_sendj   = bd->j;
1062     n         = A->rmap->rend - A->rmap->rstart;
1063     cnt       = 0;
1064     for (i=0; i<n; i++) {
1065 
1066       /* put in lower diagonal portion */
1067       m = bd->i[i+1] - bd->i[i];
1068       while (m > 0) {
1069         /* is it above diagonal (in bd (compressed) numbering) */
1070         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1071         sendbuf[cnt++] = *b_sendbuf++;
1072         m--;
1073         b_sendj++;
1074       }
1075 
1076       /* put in diagonal portion */
1077       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1078         sendbuf[cnt++] = *a_sendbuf++;
1079       }
1080 
1081       /* put in upper diagonal portion */
1082       while (m-- > 0) {
1083         sendbuf[cnt++] = *b_sendbuf++;
1084         b_sendj++;
1085       }
1086     }
1087     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1088 
1089     /* -----------------------------------------------------------------
1090        Gather all numerical values to all processors
1091     */
1092     if (!recvcounts) {
1093       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1094     }
1095     for (i=0; i<size; i++) {
1096       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1097     }
1098     displs[0] = 0;
1099     for (i=1; i<size; i++) {
1100       displs[i] = displs[i-1] + recvcounts[i-1];
1101     }
1102     recvbuf = b->a;
1103 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1104     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1105 #else
1106     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1107 #endif
1108   }  /* endof (flag == MAT_GET_VALUES) */
1109   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1110 
1111   if (A->symmetric) {
1112     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1113   } else if (A->hermitian) {
1114     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1115   } else if (A->structurally_symmetric) {
1116     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1117   }
1118   PetscFunctionReturn(0);
1119 }
1120 
1121 
1122 
1123 #undef __FUNCT__
1124 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1125 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1126 {
1127   PetscErrorCode ierr;
1128   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1129   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1130 
1131   PetscFunctionBegin;
1132 
1133   /*
1134        Check for special case: each processor gets entire matrix
1135   */
1136   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1137     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1138     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1139     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1140     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1141     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1142       wantallmatrix = PETSC_TRUE;
1143 
1144       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1145     }
1146   }
1147   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1148   if (twantallmatrix) {
1149     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1150     PetscFunctionReturn(0);
1151   }
1152 
1153   /* Allocate memory to hold all the submatrices */
1154   if (scall != MAT_REUSE_MATRIX) {
1155     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1156   }
1157 
1158   /* Check for special case: each processor gets entire matrix columns */
1159   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1160   for (i=0; i<ismax; i++) {
1161     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1162     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1163     if (colflag && ncol == C->cmap->N) {
1164       allcolumns[i] = PETSC_TRUE;
1165     } else {
1166       allcolumns[i] = PETSC_FALSE;
1167     }
1168   }
1169 
1170   /* Determine the number of stages through which submatrices are done */
1171   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1172 
1173   /*
1174      Each stage will extract nmax submatrices.
1175      nmax is determined by the matrix column dimension.
1176      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1177   */
1178   if (!nmax) nmax = 1;
1179   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1180 
1181   /* Make sure every processor loops through the nstages */
1182   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1183 
1184   for (i=0,pos=0; i<nstages; i++) {
1185     if (pos+nmax <= ismax) max_no = nmax;
1186     else if (pos == ismax) max_no = 0;
1187     else                   max_no = ismax-pos;
1188     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1189     pos += max_no;
1190   }
1191 
1192   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 /* -------------------------------------------------------------------------*/
1197 #undef __FUNCT__
1198 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1199 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1200 {
1201   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1202   Mat            A  = c->A;
1203   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1204   const PetscInt **icol,**irow;
1205   PetscInt       *nrow,*ncol,start;
1206   PetscErrorCode ierr;
1207   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1208   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1209   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1210   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1211   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1212 #if defined(PETSC_USE_CTABLE)
1213   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1214 #else
1215   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1216 #endif
1217   const PetscInt *irow_i;
1218   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1219   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1220   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1221   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1222   MPI_Status     *r_status3,*r_status4,*s_status4;
1223   MPI_Comm       comm;
1224   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1225   PetscMPIInt    *onodes1,*olengths1;
1226   PetscMPIInt    idex,idex2,end;
1227 
1228   PetscFunctionBegin;
1229   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1230   tag0 = ((PetscObject)C)->tag;
1231   size = c->size;
1232   rank = c->rank;
1233 
1234   /* Get some new tags to keep the communication clean */
1235   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1236   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1237   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1238 
1239   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1240 
1241   for (i=0; i<ismax; i++) {
1242     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1243     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1244     if (allcolumns[i]) {
1245       icol[i] = NULL;
1246       ncol[i] = C->cmap->N;
1247     } else {
1248       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1249       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1250     }
1251   }
1252 
1253   /* evaluate communication - mesg to who, length of mesg, and buffer space
1254      required. Based on this, buffers are allocated, and data copied into them*/
1255   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1256   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1257   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1258   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1259   for (i=0; i<ismax; i++) {
1260     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1261     jmax   = nrow[i];
1262     irow_i = irow[i];
1263     for (j=0; j<jmax; j++) {
1264       l   = 0;
1265       row = irow_i[j];
1266       while (row >= C->rmap->range[l+1]) l++;
1267       proc = l;
1268       w4[proc]++;
1269     }
1270     for (j=0; j<size; j++) {
1271       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1272     }
1273   }
1274 
1275   nrqs     = 0;              /* no of outgoing messages */
1276   msz      = 0;              /* total mesg length (for all procs) */
1277   w1[rank] = 0;              /* no mesg sent to self */
1278   w3[rank] = 0;
1279   for (i=0; i<size; i++) {
1280     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1281   }
1282   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1283   for (i=0,j=0; i<size; i++) {
1284     if (w1[i]) { pa[j] = i; j++; }
1285   }
1286 
1287   /* Each message would have a header = 1 + 2*(no of IS) + data */
1288   for (i=0; i<nrqs; i++) {
1289     j      = pa[i];
1290     w1[j] += w2[j] + 2* w3[j];
1291     msz   += w1[j];
1292   }
1293   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1294 
1295   /* Determine the number of messages to expect, their lengths, from from-ids */
1296   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1297   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1298 
1299   /* Now post the Irecvs corresponding to these messages */
1300   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1301 
1302   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1303   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1304 
1305   /* Allocate Memory for outgoing messages */
1306   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1307   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1308   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1309 
1310   {
1311     PetscInt *iptr = tmp,ict = 0;
1312     for (i=0; i<nrqs; i++) {
1313       j        = pa[i];
1314       iptr    += ict;
1315       sbuf1[j] = iptr;
1316       ict      = w1[j];
1317     }
1318   }
1319 
1320   /* Form the outgoing messages */
1321   /* Initialize the header space */
1322   for (i=0; i<nrqs; i++) {
1323     j           = pa[i];
1324     sbuf1[j][0] = 0;
1325     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1326     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1327   }
1328 
1329   /* Parse the isrow and copy data into outbuf */
1330   for (i=0; i<ismax; i++) {
1331     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1332     irow_i = irow[i];
1333     jmax   = nrow[i];
1334     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1335       l   = 0;
1336       row = irow_i[j];
1337       while (row >= C->rmap->range[l+1]) l++;
1338       proc = l;
1339       if (proc != rank) { /* copy to the outgoing buf*/
1340         ctr[proc]++;
1341         *ptr[proc] = row;
1342         ptr[proc]++;
1343       }
1344     }
1345     /* Update the headers for the current IS */
1346     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1347       if ((ctr_j = ctr[j])) {
1348         sbuf1_j        = sbuf1[j];
1349         k              = ++sbuf1_j[0];
1350         sbuf1_j[2*k]   = ctr_j;
1351         sbuf1_j[2*k-1] = i;
1352       }
1353     }
1354   }
1355 
1356   /*  Now  post the sends */
1357   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1358   for (i=0; i<nrqs; ++i) {
1359     j    = pa[i];
1360     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1361   }
1362 
1363   /* Post Receives to capture the buffer size */
1364   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1365   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1366   rbuf2[0] = tmp + msz;
1367   for (i=1; i<nrqs; ++i) {
1368     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1369   }
1370   for (i=0; i<nrqs; ++i) {
1371     j    = pa[i];
1372     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1373   }
1374 
1375   /* Send to other procs the buf size they should allocate */
1376 
1377 
1378   /* Receive messages*/
1379   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1380   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1381   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1382   {
1383     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1384     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1385     PetscInt   *sbuf2_i;
1386 
1387     for (i=0; i<nrqr; ++i) {
1388       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1389 
1390       req_size[idex] = 0;
1391       rbuf1_i        = rbuf1[idex];
1392       start          = 2*rbuf1_i[0] + 1;
1393       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1394       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1395       sbuf2_i        = sbuf2[idex];
1396       for (j=start; j<end; j++) {
1397         id              = rbuf1_i[j] - rstart;
1398         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1399         sbuf2_i[j]      = ncols;
1400         req_size[idex] += ncols;
1401       }
1402       req_source[idex] = r_status1[i].MPI_SOURCE;
1403       /* form the header */
1404       sbuf2_i[0] = req_size[idex];
1405       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1406 
1407       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1408     }
1409   }
1410   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1411   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1412 
1413   /*  recv buffer sizes */
1414   /* Receive messages*/
1415 
1416   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1417   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1418   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1419   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1420   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1421 
1422   for (i=0; i<nrqs; ++i) {
1423     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1424     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1425     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1426     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1427     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1428   }
1429   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1430   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1431 
1432   /* Wait on sends1 and sends2 */
1433   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1434   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1435 
1436   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1437   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1438   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1439   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1440   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1441   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1442 
1443   /* Now allocate buffers for a->j, and send them off */
1444   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1445   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1446   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1447   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1448 
1449   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1450   {
1451     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1452     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1453     PetscInt cend = C->cmap->rend;
1454     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1455 
1456     for (i=0; i<nrqr; i++) {
1457       rbuf1_i   = rbuf1[i];
1458       sbuf_aj_i = sbuf_aj[i];
1459       ct1       = 2*rbuf1_i[0] + 1;
1460       ct2       = 0;
1461       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1462         kmax = rbuf1[i][2*j];
1463         for (k=0; k<kmax; k++,ct1++) {
1464           row    = rbuf1_i[ct1] - rstart;
1465           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1466           ncols  = nzA + nzB;
1467           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1468 
1469           /* load the column indices for this row into cols*/
1470           cols = sbuf_aj_i + ct2;
1471 
1472           lwrite = 0;
1473           for (l=0; l<nzB; l++) {
1474             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1475           }
1476           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1477           for (l=0; l<nzB; l++) {
1478             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1479           }
1480 
1481           ct2 += ncols;
1482         }
1483       }
1484       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1485     }
1486   }
1487   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1488   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1489 
1490   /* Allocate buffers for a->a, and send them off */
1491   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1492   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1493   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1494   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1495 
1496   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1497   {
1498     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1499     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1500     PetscInt    cend   = C->cmap->rend;
1501     PetscInt    *b_j   = b->j;
1502     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1503 
1504     for (i=0; i<nrqr; i++) {
1505       rbuf1_i   = rbuf1[i];
1506       sbuf_aa_i = sbuf_aa[i];
1507       ct1       = 2*rbuf1_i[0]+1;
1508       ct2       = 0;
1509       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1510         kmax = rbuf1_i[2*j];
1511         for (k=0; k<kmax; k++,ct1++) {
1512           row    = rbuf1_i[ct1] - rstart;
1513           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1514           ncols  = nzA + nzB;
1515           cworkB = b_j + b_i[row];
1516           vworkA = a_a + a_i[row];
1517           vworkB = b_a + b_i[row];
1518 
1519           /* load the column values for this row into vals*/
1520           vals = sbuf_aa_i+ct2;
1521 
1522           lwrite = 0;
1523           for (l=0; l<nzB; l++) {
1524             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1525           }
1526           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1527           for (l=0; l<nzB; l++) {
1528             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1529           }
1530 
1531           ct2 += ncols;
1532         }
1533       }
1534       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1535     }
1536   }
1537   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1538   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1539   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1540   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1541 
1542   /* Form the matrix */
1543   /* create col map: global col of C -> local col of submatrices */
1544   {
1545     const PetscInt *icol_i;
1546 #if defined(PETSC_USE_CTABLE)
1547     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1548     for (i=0; i<ismax; i++) {
1549       if (!allcolumns[i]) {
1550         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1551 
1552         jmax   = ncol[i];
1553         icol_i = icol[i];
1554         cmap_i = cmap[i];
1555         for (j=0; j<jmax; j++) {
1556           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1557         }
1558       } else {
1559         cmap[i] = NULL;
1560       }
1561     }
1562 #else
1563     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1564     for (i=0; i<ismax; i++) {
1565       if (!allcolumns[i]) {
1566         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1567         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1568         jmax   = ncol[i];
1569         icol_i = icol[i];
1570         cmap_i = cmap[i];
1571         for (j=0; j<jmax; j++) {
1572           cmap_i[icol_i[j]] = j+1;
1573         }
1574       } else {
1575         cmap[i] = NULL;
1576       }
1577     }
1578 #endif
1579   }
1580 
1581   /* Create lens which is required for MatCreate... */
1582   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1583   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1584   if (ismax) {
1585     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1586     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1587   }
1588   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1589 
1590   /* Update lens from local data */
1591   for (i=0; i<ismax; i++) {
1592     jmax = nrow[i];
1593     if (!allcolumns[i]) cmap_i = cmap[i];
1594     irow_i = irow[i];
1595     lens_i = lens[i];
1596     for (j=0; j<jmax; j++) {
1597       l   = 0;
1598       row = irow_i[j];
1599       while (row >= C->rmap->range[l+1]) l++;
1600       proc = l;
1601       if (proc == rank) {
1602         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1603         if (!allcolumns[i]) {
1604           for (k=0; k<ncols; k++) {
1605 #if defined(PETSC_USE_CTABLE)
1606             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1607 #else
1608             tcol = cmap_i[cols[k]];
1609 #endif
1610             if (tcol) lens_i[j]++;
1611           }
1612         } else { /* allcolumns */
1613           lens_i[j] = ncols;
1614         }
1615         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1616       }
1617     }
1618   }
1619 
1620   /* Create row map: global row of C -> local row of submatrices */
1621 #if defined(PETSC_USE_CTABLE)
1622   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1623   for (i=0; i<ismax; i++) {
1624     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1625     irow_i = irow[i];
1626     jmax   = nrow[i];
1627     for (j=0; j<jmax; j++) {
1628       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1629     }
1630   }
1631 #else
1632   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1633   if (ismax) {
1634     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1635     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1636   }
1637   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1638   for (i=0; i<ismax; i++) {
1639     rmap_i = rmap[i];
1640     irow_i = irow[i];
1641     jmax   = nrow[i];
1642     for (j=0; j<jmax; j++) {
1643       rmap_i[irow_i[j]] = j;
1644     }
1645   }
1646 #endif
1647 
1648   /* Update lens from offproc data */
1649   {
1650     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1651 
1652     for (tmp2=0; tmp2<nrqs; tmp2++) {
1653       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1654       idex    = pa[idex2];
1655       sbuf1_i = sbuf1[idex];
1656       jmax    = sbuf1_i[0];
1657       ct1     = 2*jmax+1;
1658       ct2     = 0;
1659       rbuf2_i = rbuf2[idex2];
1660       rbuf3_i = rbuf3[idex2];
1661       for (j=1; j<=jmax; j++) {
1662         is_no  = sbuf1_i[2*j-1];
1663         max1   = sbuf1_i[2*j];
1664         lens_i = lens[is_no];
1665         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1666         rmap_i = rmap[is_no];
1667         for (k=0; k<max1; k++,ct1++) {
1668 #if defined(PETSC_USE_CTABLE)
1669           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1670           row--;
1671           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1672 #else
1673           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1674 #endif
1675           max2 = rbuf2_i[ct1];
1676           for (l=0; l<max2; l++,ct2++) {
1677             if (!allcolumns[is_no]) {
1678 #if defined(PETSC_USE_CTABLE)
1679               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1680 #else
1681               tcol = cmap_i[rbuf3_i[ct2]];
1682 #endif
1683               if (tcol) lens_i[row]++;
1684             } else { /* allcolumns */
1685               lens_i[row]++; /* lens_i[row] += max2 ? */
1686             }
1687           }
1688         }
1689       }
1690     }
1691   }
1692   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1693   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1694   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1695   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1696   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1697 
1698   /* Create the submatrices */
1699   if (scall == MAT_REUSE_MATRIX) {
1700     PetscBool flag;
1701 
1702     /*
1703         Assumes new rows are same length as the old rows,hence bug!
1704     */
1705     for (i=0; i<ismax; i++) {
1706       mat = (Mat_SeqAIJ*)(submats[i]->data);
1707       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1708       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1709       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1710       /* Initial matrix as if empty */
1711       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1712 
1713       submats[i]->factortype = C->factortype;
1714     }
1715   } else {
1716     for (i=0; i<ismax; i++) {
1717       PetscInt rbs,cbs;
1718       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1719       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1720 
1721       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1722       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1723 
1724       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1725       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1726       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1727     }
1728   }
1729 
1730   /* Assemble the matrices */
1731   /* First assemble the local rows */
1732   {
1733     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1734     PetscScalar *imat_a;
1735 
1736     for (i=0; i<ismax; i++) {
1737       mat       = (Mat_SeqAIJ*)submats[i]->data;
1738       imat_ilen = mat->ilen;
1739       imat_j    = mat->j;
1740       imat_i    = mat->i;
1741       imat_a    = mat->a;
1742 
1743       if (!allcolumns[i]) cmap_i = cmap[i];
1744       rmap_i = rmap[i];
1745       irow_i = irow[i];
1746       jmax   = nrow[i];
1747       for (j=0; j<jmax; j++) {
1748         l   = 0;
1749         row = irow_i[j];
1750         while (row >= C->rmap->range[l+1]) l++;
1751         proc = l;
1752         if (proc == rank) {
1753           old_row = row;
1754 #if defined(PETSC_USE_CTABLE)
1755           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1756           row--;
1757 #else
1758           row = rmap_i[row];
1759 #endif
1760           ilen_row = imat_ilen[row];
1761           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1762           mat_i    = imat_i[row];
1763           mat_a    = imat_a + mat_i;
1764           mat_j    = imat_j + mat_i;
1765           if (!allcolumns[i]) {
1766             for (k=0; k<ncols; k++) {
1767 #if defined(PETSC_USE_CTABLE)
1768               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1769 #else
1770               tcol = cmap_i[cols[k]];
1771 #endif
1772               if (tcol) {
1773                 *mat_j++ = tcol - 1;
1774                 *mat_a++ = vals[k];
1775                 ilen_row++;
1776               }
1777             }
1778           } else { /* allcolumns */
1779             for (k=0; k<ncols; k++) {
1780               *mat_j++ = cols[k];  /* global col index! */
1781               *mat_a++ = vals[k];
1782               ilen_row++;
1783             }
1784           }
1785           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1786 
1787           imat_ilen[row] = ilen_row;
1788         }
1789       }
1790     }
1791   }
1792 
1793   /*   Now assemble the off proc rows*/
1794   {
1795     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1796     PetscInt    *imat_j,*imat_i;
1797     PetscScalar *imat_a,*rbuf4_i;
1798 
1799     for (tmp2=0; tmp2<nrqs; tmp2++) {
1800       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1801       idex    = pa[idex2];
1802       sbuf1_i = sbuf1[idex];
1803       jmax    = sbuf1_i[0];
1804       ct1     = 2*jmax + 1;
1805       ct2     = 0;
1806       rbuf2_i = rbuf2[idex2];
1807       rbuf3_i = rbuf3[idex2];
1808       rbuf4_i = rbuf4[idex2];
1809       for (j=1; j<=jmax; j++) {
1810         is_no     = sbuf1_i[2*j-1];
1811         rmap_i    = rmap[is_no];
1812         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1813         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1814         imat_ilen = mat->ilen;
1815         imat_j    = mat->j;
1816         imat_i    = mat->i;
1817         imat_a    = mat->a;
1818         max1      = sbuf1_i[2*j];
1819         for (k=0; k<max1; k++,ct1++) {
1820           row = sbuf1_i[ct1];
1821 #if defined(PETSC_USE_CTABLE)
1822           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1823           row--;
1824 #else
1825           row = rmap_i[row];
1826 #endif
1827           ilen  = imat_ilen[row];
1828           mat_i = imat_i[row];
1829           mat_a = imat_a + mat_i;
1830           mat_j = imat_j + mat_i;
1831           max2  = rbuf2_i[ct1];
1832           if (!allcolumns[is_no]) {
1833             for (l=0; l<max2; l++,ct2++) {
1834 
1835 #if defined(PETSC_USE_CTABLE)
1836               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1837 #else
1838               tcol = cmap_i[rbuf3_i[ct2]];
1839 #endif
1840               if (tcol) {
1841                 *mat_j++ = tcol - 1;
1842                 *mat_a++ = rbuf4_i[ct2];
1843                 ilen++;
1844               }
1845             }
1846           } else { /* allcolumns */
1847             for (l=0; l<max2; l++,ct2++) {
1848               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1849               *mat_a++ = rbuf4_i[ct2];
1850               ilen++;
1851             }
1852           }
1853           imat_ilen[row] = ilen;
1854         }
1855       }
1856     }
1857   }
1858 
1859   /* sort the rows */
1860   {
1861     PetscInt    *imat_ilen,*imat_j,*imat_i;
1862     PetscScalar *imat_a;
1863 
1864     for (i=0; i<ismax; i++) {
1865       mat       = (Mat_SeqAIJ*)submats[i]->data;
1866       imat_j    = mat->j;
1867       imat_i    = mat->i;
1868       imat_a    = mat->a;
1869       imat_ilen = mat->ilen;
1870 
1871       if (allcolumns[i]) continue;
1872       jmax = nrow[i];
1873       for (j=0; j<jmax; j++) {
1874         PetscInt ilen;
1875 
1876         mat_i = imat_i[j];
1877         mat_a = imat_a + mat_i;
1878         mat_j = imat_j + mat_i;
1879         ilen  = imat_ilen[j];
1880         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1881       }
1882     }
1883   }
1884 
1885   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1886   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1887   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1888   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1889   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1890 
1891   /* Restore the indices */
1892   for (i=0; i<ismax; i++) {
1893     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1894     if (!allcolumns[i]) {
1895       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1896     }
1897   }
1898 
1899   /* Destroy allocated memory */
1900   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1901   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1902   ierr = PetscFree(pa);CHKERRQ(ierr);
1903 
1904   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1905   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1906   for (i=0; i<nrqr; ++i) {
1907     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1908   }
1909   for (i=0; i<nrqs; ++i) {
1910     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1911     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1912   }
1913 
1914   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1915   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1916   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1917   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1918   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1919   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1920   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1921 
1922 #if defined(PETSC_USE_CTABLE)
1923   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1924 #else
1925   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1926 #endif
1927   ierr = PetscFree(rmap);CHKERRQ(ierr);
1928 
1929   for (i=0; i<ismax; i++) {
1930     if (!allcolumns[i]) {
1931 #if defined(PETSC_USE_CTABLE)
1932       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1933 #else
1934       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1935 #endif
1936     }
1937   }
1938   ierr = PetscFree(cmap);CHKERRQ(ierr);
1939   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1940   ierr = PetscFree(lens);CHKERRQ(ierr);
1941 
1942   for (i=0; i<ismax; i++) {
1943     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1944     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1945   }
1946   PetscFunctionReturn(0);
1947 }
1948 
1949 /*
1950  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1951  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
1952  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
1953  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1954  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1955  state, and needs to be "assembled" later by compressing B's column space.
1956 
1957  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1958  Following this call, C->A & C->B have been created, even if empty.
1959  */
1960 #undef __FUNCT__
1961 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
1962 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1963 {
1964   /* If making this function public, change the error returned in this function away from _PLIB. */
1965   PetscErrorCode ierr;
1966   Mat_MPIAIJ     *aij;
1967   Mat_SeqAIJ     *Baij;
1968   PetscBool      seqaij,Bdisassembled;
1969   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1970   PetscScalar    v;
1971   const PetscInt *rowindices,*colindices;
1972 
1973   PetscFunctionBegin;
1974   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1975   if (A) {
1976     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1977     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1978     if (rowemb) {
1979       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1980       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1981     } else {
1982       if (C->rmap->n != A->rmap->n) {
1983 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1984       }
1985     }
1986     if (dcolemb) {
1987       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
1988       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1989     } else {
1990       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1991     }
1992   }
1993   if (B) {
1994     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1995     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1996     if (rowemb) {
1997       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1998       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
1999     } else {
2000       if (C->rmap->n != B->rmap->n) {
2001 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2002       }
2003     }
2004     if (ocolemb) {
2005       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2006       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2007     } else {
2008       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2009     }
2010   }
2011 
2012   aij    = (Mat_MPIAIJ*)(C->data);
2013   if (!aij->A) {
2014     /* Mimic parts of MatMPIAIJSetPreallocation() */
2015     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2016     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2017     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2018     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2019     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2020   }
2021   if (A) {
2022     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2023   } else {
2024     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2025   }
2026   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2027     /*
2028       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2029       need to "disassemble" B -- convert it to using C's global indices.
2030       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2031 
2032       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2033       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2034 
2035       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2036       At least avoid calling MatSetValues() and the implied searches?
2037     */
2038 
2039     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2040 #if defined(PETSC_USE_CTABLE)
2041       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2042 #else
2043       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2044       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2045       if (aij->B) {
2046         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2047       }
2048 #endif
2049       ngcol = 0;
2050       if (aij->lvec) {
2051 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2052       }
2053       if (aij->garray) {
2054 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2055 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2056       }
2057       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2058       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2059     }
2060     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2061       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2062     }
2063     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2064       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2065     }
2066   }
2067   Bdisassembled = PETSC_FALSE;
2068   if (!aij->B) {
2069     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2070     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2071     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2072     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2073     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2074     Bdisassembled = PETSC_TRUE;
2075   }
2076   if (B) {
2077     Baij = (Mat_SeqAIJ*)(B->data);
2078     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2079       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2080       for (i=0; i<B->rmap->n; i++) {
2081 	nz[i] = Baij->i[i+1] - Baij->i[i];
2082       }
2083       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2084       ierr = PetscFree(nz);CHKERRQ(ierr);
2085     }
2086 
2087     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2088     shift = rend-rstart;
2089     count = 0;
2090     rowindices = NULL;
2091     colindices = NULL;
2092     if (rowemb) {
2093       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2094     }
2095     if (ocolemb) {
2096       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2097     }
2098     for (i=0; i<B->rmap->n; i++) {
2099       PetscInt row;
2100       row = i;
2101       if (rowindices) row = rowindices[i];
2102       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2103 	col  = Baij->j[count];
2104 	if (colindices) col = colindices[col];
2105 	if (Bdisassembled && col>=rstart) col += shift;
2106 	v    = Baij->a[count];
2107 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
2108 	++count;
2109       }
2110     }
2111     /* No assembly for aij->B is necessary. */
2112     /* FIXME: set aij->B's nonzerostate correctly. */
2113   } else {
2114     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
2115   }
2116   C->preallocated  = PETSC_TRUE;
2117   C->was_assembled = PETSC_FALSE;
2118   C->assembled     = PETSC_FALSE;
2119    /*
2120       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2121       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2122    */
2123   PetscFunctionReturn(0);
2124 }
2125 
2126 #undef __FUNCT__
2127 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
2128 /*
2129   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
2130  */
2131 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2132 {
2133   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2134 
2135   PetscFunctionBegin;
2136   PetscValidPointer(A,2);
2137   PetscValidPointer(B,3);
2138   /* FIXME: make sure C is assembled */
2139   *A = aij->A;
2140   *B = aij->B;
2141   /* Note that we don't incref *A and *B, so be careful! */
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 /*
2146   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2147   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2148 */
2149 #undef __FUNCT__
2150 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
2151 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2152                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2153 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2154 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2155 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2156 {
2157   PetscErrorCode ierr;
2158   PetscMPIInt    isize,flag;
2159   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
2160   Mat            *A,*B;
2161   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
2162 
2163   PetscFunctionBegin;
2164   if (!ismax) PetscFunctionReturn(0);
2165 
2166   for (i = 0, cismax = 0; i < ismax; ++i) {
2167     PetscMPIInt isize;
2168     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
2169     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2170     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
2171     if (isize > 1) ++cismax;
2172   }
2173   /*
2174      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2175      ispar counts the number of parallel ISs across C's comm.
2176   */
2177   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2178   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2179     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2180     PetscFunctionReturn(0);
2181   }
2182 
2183   /* if (ispar) */
2184   /*
2185     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2186     These are used to extract the off-diag portion of the resulting parallel matrix.
2187     The row IS for the off-diag portion is the same as for the diag portion,
2188     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2189   */
2190   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
2191   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
2192   for (i = 0, ii = 0; i < ismax; ++i) {
2193     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2194     if (isize > 1) {
2195       /*
2196 	 TODO: This is the part that's ***NOT SCALABLE***.
2197 	 To fix this we need to extract just the indices of C's nonzero columns
2198 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2199 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
2200 	 to be done without serializing on the IS list, so, most likely, it is best
2201 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2202       */
2203       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
2204       /* Now we have to
2205 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2206 	     were sorted on each rank, concatenated they might no longer be sorted;
2207 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2208 	     indices in the nondecreasing order to the original index positions.
2209 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2210       */
2211       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
2212       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
2213       ++ii;
2214     }
2215   }
2216   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
2217   for (i = 0, ii = 0; i < ismax; ++i) {
2218     PetscInt       j,issize;
2219     const PetscInt *indices;
2220 
2221     /*
2222        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2223      */
2224     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
2225     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
2226     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
2227     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
2228     for (j = 1; j < issize; ++j) {
2229       if (indices[j] == indices[j-1]) {
2230 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2231       }
2232     }
2233     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
2234 
2235 
2236     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
2237     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
2238     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
2239     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
2240     for (j = 1; j < issize; ++j) {
2241       if (indices[j-1] == indices[j]) {
2242 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2243       }
2244     }
2245     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
2246     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2247     if (isize > 1) {
2248       cisrow[ii] = isrow[i];
2249       ++ii;
2250     }
2251   }
2252   /*
2253     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2254     array of sequential matrices underlying the resulting parallel matrices.
2255     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2256     contain duplicates.
2257 
2258     There are as many diag matrices as there are original index sets. There are only as many parallel
2259     and off-diag matrices, as there are parallel (comm size > 1) index sets.
2260 
2261     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2262     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2263       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2264       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2265       with A[i] and B[ii] extracted from the corresponding MPI submat.
2266     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2267       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
2268       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2269       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
2270       values into A[i] and B[ii] sitting inside the corresponding submat.
2271     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2272       A[i], B[ii], AA[i] or BB[ii] matrices.
2273   */
2274   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2275   if (scall == MAT_INITIAL_MATRIX) {
2276     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2277     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2278   } else {
2279     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
2280     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
2281     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2282     for (i = 0, ii = 0; i < ismax; ++i) {
2283       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2284       if (isize > 1) {
2285 	Mat AA,BB;
2286 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
2287 	if (!isrow_p[i] && !iscol_p[i]) {
2288 	  A[i] = AA;
2289 	} else {
2290 	  /* TODO: extract A[i] composed on AA. */
2291 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2292 	}
2293 	if (!isrow_p[i] && !ciscol_p[ii]) {
2294 	  B[ii] = BB;
2295 	} else {
2296 	  /* TODO: extract B[ii] composed on BB. */
2297 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
2298 	}
2299 	++ii;
2300       } else {
2301 	if (!isrow_p[i] && !iscol_p[i]) {
2302 	  A[i] = (*submat)[i];
2303 	} else {
2304 	  /* TODO: extract A[i] composed on (*submat)[i]. */
2305 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2306 	}
2307       }
2308     }
2309   }
2310   /* Now obtain the sequential A and B submatrices separately. */
2311   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
2312   /* I did not figure out a good way to fix it right now.
2313    * Local column size of B[i] is different from the size of ciscol[i].
2314    * B[i]'s size is finally determined by MatAssembly, while
2315    * ciscol[i] is computed as the complement of iscol[i].
2316    * It is better to keep only nonzero indices when computing
2317    * the complement ciscol[i].
2318    * */
2319   if(scall==MAT_REUSE_MATRIX){
2320 	for(i=0; i<cismax; i++){
2321 	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
2322 	  B[i]->cmap->n = ciscol_localsize;
2323 	}
2324   }
2325   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
2326   /*
2327     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2328     matrices A & B have been extracted directly into the parallel matrices containing them, or
2329     simply into the sequential matrix identical with the corresponding A (if isize == 1).
2330     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2331     to have the same sparsity pattern.
2332     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2333     must be constructed for C. This is done by setseqmat(s).
2334   */
2335   for (i = 0, ii = 0; i < ismax; ++i) {
2336     /*
2337        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2338        That way we can avoid sorting and computing permutations when reusing.
2339        To this end:
2340         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2341 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
2342 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2343     */
2344     MatStructure pattern;
2345     if (scall == MAT_INITIAL_MATRIX) {
2346       pattern = DIFFERENT_NONZERO_PATTERN;
2347     } else {
2348       pattern = SAME_NONZERO_PATTERN;
2349     }
2350     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2351     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2352     if (isize > 1) {
2353       if (scall == MAT_INITIAL_MATRIX) {
2354 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
2355 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2356 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
2357 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
2358 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
2359       }
2360       /*
2361 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2362       */
2363       {
2364 	Mat AA,BB;
2365 	AA = NULL;
2366 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2367 	BB = NULL;
2368 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2369 	if (AA || BB) {
2370 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
2371 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2372 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2373 	}
2374 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2375 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2376 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
2377 	}
2378 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2379 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
2380 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
2381 	}
2382       }
2383       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
2384       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
2385       ++ii;
2386     } else { /* if (isize == 1) */
2387       if (scall == MAT_INITIAL_MATRIX) {
2388 	if (isrow_p[i] || iscol_p[i]) {
2389 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
2390 	} else (*submat)[i] = A[i];
2391       }
2392       if (isrow_p[i] || iscol_p[i]) {
2393 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
2394 	/* Otherwise A is extracted straight into (*submats)[i]. */
2395 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2396 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
2397       }
2398     }
2399     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
2400     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
2401   }
2402   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
2403   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
2404   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
2405   ierr = PetscFree(A);CHKERRQ(ierr);
2406   ierr = PetscFree(B);CHKERRQ(ierr);
2407   PetscFunctionReturn(0);
2408 }
2409 
2410 
2411 
2412 #undef __FUNCT__
2413 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
2414 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2415 {
2416   PetscErrorCode ierr;
2417 
2418   PetscFunctionBegin;
2419   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
2420   PetscFunctionReturn(0);
2421 }
2422