xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 6b3eee04a67a44bd9a4e65b41b1dd96ffcd06ca3) !
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #include <../src/mat/impls/aij/mpi/mpiaij.h>
8 #include <petscbt.h>
9 #include <petscsf.h>
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21 
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
26 {
27   PetscErrorCode ierr;
28   PetscInt       i;
29 
30   PetscFunctionBegin;
31   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32   for (i=0; i<ov; ++i) {
33     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
34   }
35   PetscFunctionReturn(0);
36 }
37 
38 #undef __FUNCT__
39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
41 {
42   PetscErrorCode ierr;
43   PetscInt       i;
44 
45   PetscFunctionBegin;
46   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
47   for (i=0; i<ov; ++i) {
48     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
49   }
50   PetscFunctionReturn(0);
51 }
52 
53 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Increase the overlap of the index
   sets is[] by one layer, using only O(local) memory.

   Stages visible below:
     1) concatenate all index-set entries and locate rows owned by remote
        processors (PetscLayoutFindOwner);
     2) if nobody has remote rows, only the local expansion is needed;
     3) otherwise build a two-sided pattern (PetscCommBuildTwoSided) and ship
        <is id, row> request pairs via a PetscSF broadcast;
     4) the owners expand their rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable);
        the second SF broadcast that ships the answers back is overlapped with
        the local expansion (MatIncreaseOverlap_MPIAIJ_Local_Scalable);
     5) the answers are merged into is[] by
        MatIncreaseOverlap_MPIAIJ_Receive_Scalable.

   Collective on Mat.

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - the index sets, replaced in place with the overlapped sets
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size (each row request is a pair) */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* each leaf points at the (rank, offset+j) slot of the sender's buffer */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* roles now reverse: the former receivers send the expanded rows back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the returned <is id, count, cols...> triples into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
198 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap data computed by
   remote owners back into the local index sets.

   recvdata is a stream of triples <is id, data_size, data_size column indices>
   (packed by MatIncreaseOverlap_MPIAIJ_Send_Scalable).  Each payload is
   appended to the matching index set; every set is then sorted, deduplicated
   and recreated as a fresh sequential IS.

   Input Parameters:
+  mat      - the matrix (unused here; kept for a uniform kernel interface)
.  nidx     - number of index sets
.  is       - the index sets, destroyed and recreated in place
.  nrecvs   - total number of received integers in recvdata
-  recvdata - the received message stream
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one stripe of (max_lsize+nrecvs) per set: enough for the current entries
     plus, in the worst case, the entire received stream */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){            /* i advances inside: header + payload */
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
246 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Expand the rows requested by
   remote processors into their column indices and pack the reply messages.

   The incoming request stream fromrows consists of <is id, row> pairs grouped
   by sending rank (fromsizes[2*i] integers from rank fromranks[i]).  The code
   makes two passes over it: the first pass only counts (after sorting and
   deduplicating) how many columns each (rank, is) combination produces, so
   that a single reply buffer can be allocated; the second pass fills that
   buffer with <is id, count, cols...> triples.  Columns come from both the
   diagonal block (local column + cstart) and the off-diagonal block (mapped
   through the global-column array gcols).

   Output Parameters:
+  sbrowsizes - per destination rank: [2*i] = message length, [2*i+1] = offset
                into sbrows; caller takes ownership
-  sbrows     - the packed reply data; caller takes ownership
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count the (deduplicated) reply size per (rank, is) pair */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same walk again, this time writing the packed replies */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* record per-rank offsets in the odd slots of sbsizes */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
370 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Local part of the scalable
   overlap algorithm: for every locally-owned row already in an index set,
   append all of that row's column indices (diagonal block shifted by cstart,
   off-diagonal block mapped through gcols), then sort, deduplicate and
   recreate each IS.  Rows owned by other processors are skipped here; they
   are handled by the send/receive kernels.

   Input Parameters:
+  mat  - the MPIAIJ matrix
.  nidx - number of index sets
-  is   - the index sets, destroyed and recreated in place
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows are expanded by their owner */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
436 
437 
438 /*
439   Sample message format:
440   If a processor A wants processor B to process some elements corresponding
441   to index sets is[1],is[5]
442   mesg [0] = 2   (no of index sets in the mesg)
443   -----------
444   mesg [1] = 1 => is[1]
445   mesg [2] = sizeof(is[1]);
446   -----------
447   mesg [3] = 5  => is[5]
448   mesg [4] = sizeof(is[5]);
449   -----------
450   mesg [5]
451   mesg [n]  data(is[1])
452   -----------
453   mesg[n+1]
454   mesg[m]  data(is[5])
455   -----------
456 
457   Notes:
458   nrqs - no of requests sent (or to be sent out)
459   nrqr - no of requests received (which have to be, or which have been, processed)
460 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increase the overlap of the index sets by
   one layer using the original (non-scalable) algorithm: each processor ships
   the rows it does not own to their owners (message format documented above),
   performs its local expansion, and merges the remotely computed candidates
   through one PetscBT bit table per index set.

   NOTE(review): the bit tables and candidate arrays are sized by the GLOBAL
   row count M (see the Calloc5 below), which is why this variant does not
   scale in memory.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = total rows going to p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of index sets contributing to p, w4 = per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into one slice per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt Mimax = 0,M_BPB_imax = 0;
    /* overflow-checked products: M*imax candidate slots and one M-bit table per IS */
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];  /* bump the IS count in the header */
          outdat_j[2*k]   = ctr_j;          /* number of rows for this IS */
          outdat_j[2*k-1] = i;              /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply to sender i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      /* recv_status order must match onodes1 since both come from phase-1 recvs */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header is <no of IS, (IS id, count) pairs>,
         followed by the rows themselves */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* data starts after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (previously destroyed) index sets from the merged data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
735 
736 #undef __FUNCT__
737 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
738 /*
739    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
740        the work on the local processor.
741 
742      Inputs:
743       C      - MAT_MPIAIJ;
744       imax - total no of index sets processed at a time;
745       table  - an array of char - size = m bits.
746 
747      Output:
748       isz    - array containing the count of the solution elements corresponding
749                to each index set;
750       data   - pointer to the solutions
751 */
752 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
753 {
754   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
755   Mat        A  = c->A,B = c->B;
756   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
757   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
758   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
759   PetscBT    table_i;
760 
761   PetscFunctionBegin;
762   rstart = C->rmap->rstart;
763   cstart = C->cmap->rstart;
764   ai     = a->i;
765   aj     = a->j;
766   bi     = b->i;
767   bj     = b->j;
768   garray = c->garray;
769 
770 
771   for (i=0; i<imax; i++) {
772     data_i  = data[i];
773     table_i = table[i];
774     isz_i   = isz[i];
775     for (j=0,max=isz[i]; j<max; j++) {
776       row   = data_i[j] - rstart;
777       start = ai[row];
778       end   = ai[row+1];
779       for (k=start; k<end; k++) { /* Amat */
780         val = aj[k] + cstart;
781         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
782       }
783       start = bi[row];
784       end   = bi[row+1];
785       for (k=start; k<end; k++) { /* Bmat */
786         val = garray[bj[k]];
787         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
788       }
789     }
790     isz[i] = isz_i;
791   }
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
797 /*
798       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
799          and return the output
800 
801          Input:
802            C    - the matrix
803            nrqr - no of messages being processed.
804            rbuf - an array of pointers to the recieved requests
805 
806          Output:
807            xdata - array of messages to be sent back
808            isz1  - size of each message
809 
810   For better efficiency perhaps we should malloc separately each xdata[i],
811 then if a remalloc is required we need only copy the data for that one row
812 rather then all previous rows as it is now where a single large chunck of
813 memory is used.
814 
815 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* Count the total number of index sets referenced (ct) and the total number
     of requested row indices (total_sz) across all incoming messages.
     Message layout: rbuf_i[0] = #IS, then (is_no,count) header pairs, then rows. */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the size of one shared reply buffer (xdata[0]) from the average
     nonzeros per local row; it is grown on demand below if too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 tracks the total space consumed in the shared buffer, checked against
     mem_estimate before every write */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor: first row index past the header */
    ct2    =  ct1;         /* write cursor into xdata[i] (header space reserved) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;          /* remember where this IS's rows begin */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          /* out of space: grow the shared buffer by 1.5x, then rebase the
             per-message pointers of all completed messages into the new buffer */
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Append, for each accepted row, its global columns from both the
         diagonal (A) and off-diagonal (B) blocks — this is the overlap growth */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat: local column + cstart = global column */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat: garray maps compressed to global column */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message continues in the same shared buffer */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
932 /* -------------------------------------------------------------------------*/
933 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
934 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
935 /*
936     Every processor gets the entire matrix
937 */
938 #undef __FUNCT__
939 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  /* Gathers the WHOLE parallel matrix onto every rank as a sequential matrix.
     MAT_INITIAL_MATRIX builds the structure (and values); MAT_REUSE_MATRIX
     only refreshes the numerical values when flag == MAT_GET_VALUES. */
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* each rank's segment of lens is already in place at offset displs[rank] */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j array at my row block */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* Merge the off-diagonal (B) and diagonal (A) blocks so each row's global
       column indices come out in ascending order: B-below-diagonal, A, B-above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are recomputed in units of nonzeros (not rows) */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write directly into B's value array at my row block */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are interleaved in exactly the same B-A-B order as the column
       indices above, so they line up with b->j */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: the counts/displs from the INITIAL path were never built */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1122 
1123 
1124 
1125 #undef __FUNCT__
1126 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
1127 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1128 {
1129   PetscErrorCode ierr;
1130   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1131   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1132 
1133   PetscFunctionBegin;
1134 
1135   /*
1136        Check for special case: each processor gets entire matrix
1137   */
1138   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1139     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1140     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1141     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1142     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1143     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1144       wantallmatrix = PETSC_TRUE;
1145 
1146       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
1147     }
1148   }
1149   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1150   if (twantallmatrix) {
1151     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
1152     PetscFunctionReturn(0);
1153   }
1154 
1155   /* Allocate memory to hold all the submatrices */
1156   if (scall != MAT_REUSE_MATRIX) {
1157     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
1158   }
1159 
1160   /* Check for special case: each processor gets entire matrix columns */
1161   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
1162   for (i=0; i<ismax; i++) {
1163     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
1164     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
1165     if (colflag && ncol == C->cmap->N) {
1166       allcolumns[i] = PETSC_TRUE;
1167     } else {
1168       allcolumns[i] = PETSC_FALSE;
1169     }
1170   }
1171 
1172   /* Determine the number of stages through which submatrices are done */
1173   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1174 
1175   /*
1176      Each stage will extract nmax submatrices.
1177      nmax is determined by the matrix column dimension.
1178      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1179   */
1180   if (!nmax) nmax = 1;
1181   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1182 
1183   /* Make sure every processor loops through the nstages */
1184   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1185 
1186   for (i=0,pos=0; i<nstages; i++) {
1187     if (pos+nmax <= ismax) max_no = nmax;
1188     else if (pos == ismax) max_no = 0;
1189     else                   max_no = ismax-pos;
1190     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
1191     pos += max_no;
1192   }
1193 
1194   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 /* -------------------------------------------------------------------------*/
1199 #undef __FUNCT__
1200 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
1201 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1202 {
1203   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1204   Mat            A  = c->A;
1205   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1206   const PetscInt **icol,**irow;
1207   PetscInt       *nrow,*ncol,start;
1208   PetscErrorCode ierr;
1209   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1210   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1211   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1212   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1213   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1214 #if defined(PETSC_USE_CTABLE)
1215   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1216 #else
1217   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1218 #endif
1219   const PetscInt *irow_i;
1220   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1221   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1222   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1223   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1224   MPI_Status     *r_status3,*r_status4,*s_status4;
1225   MPI_Comm       comm;
1226   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1227   PetscMPIInt    *onodes1,*olengths1;
1228   PetscMPIInt    idex,idex2,end;
1229 
1230   PetscFunctionBegin;
1231   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1232   tag0 = ((PetscObject)C)->tag;
1233   size = c->size;
1234   rank = c->rank;
1235 
1236   /* Get some new tags to keep the communication clean */
1237   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
1238   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1239   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1240 
1241   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
1242 
1243   for (i=0; i<ismax; i++) {
1244     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
1245     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
1246     if (allcolumns[i]) {
1247       icol[i] = NULL;
1248       ncol[i] = C->cmap->N;
1249     } else {
1250       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
1251       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
1252     }
1253   }
1254 
1255   /* evaluate communication - mesg to who, length of mesg, and buffer space
1256      required. Based on this, buffers are allocated, and data copied into them*/
1257   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
1258   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1259   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1260   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
1261   for (i=0; i<ismax; i++) {
1262     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
1263     jmax   = nrow[i];
1264     irow_i = irow[i];
1265     for (j=0; j<jmax; j++) {
1266       l   = 0;
1267       row = irow_i[j];
1268       while (row >= C->rmap->range[l+1]) l++;
1269       proc = l;
1270       w4[proc]++;
1271     }
1272     for (j=0; j<size; j++) {
1273       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1274     }
1275   }
1276 
1277   nrqs     = 0;              /* no of outgoing messages */
1278   msz      = 0;              /* total mesg length (for all procs) */
1279   w1[rank] = 0;              /* no mesg sent to self */
1280   w3[rank] = 0;
1281   for (i=0; i<size; i++) {
1282     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1283   }
1284   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1285   for (i=0,j=0; i<size; i++) {
1286     if (w1[i]) { pa[j] = i; j++; }
1287   }
1288 
1289   /* Each message would have a header = 1 + 2*(no of IS) + data */
1290   for (i=0; i<nrqs; i++) {
1291     j      = pa[i];
1292     w1[j] += w2[j] + 2* w3[j];
1293     msz   += w1[j];
1294   }
1295   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1296 
1297   /* Determine the number of messages to expect, their lengths, from from-ids */
1298   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1299   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1300 
1301   /* Now post the Irecvs corresponding to these messages */
1302   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1303 
1304   ierr = PetscFree(onodes1);CHKERRQ(ierr);
1305   ierr = PetscFree(olengths1);CHKERRQ(ierr);
1306 
1307   /* Allocate Memory for outgoing messages */
1308   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1309   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1310   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1311 
1312   {
1313     PetscInt *iptr = tmp,ict = 0;
1314     for (i=0; i<nrqs; i++) {
1315       j        = pa[i];
1316       iptr    += ict;
1317       sbuf1[j] = iptr;
1318       ict      = w1[j];
1319     }
1320   }
1321 
1322   /* Form the outgoing messages */
1323   /* Initialize the header space */
1324   for (i=0; i<nrqs; i++) {
1325     j           = pa[i];
1326     sbuf1[j][0] = 0;
1327     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
1328     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1329   }
1330 
1331   /* Parse the isrow and copy data into outbuf */
1332   for (i=0; i<ismax; i++) {
1333     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1334     irow_i = irow[i];
1335     jmax   = nrow[i];
1336     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1337       l   = 0;
1338       row = irow_i[j];
1339       while (row >= C->rmap->range[l+1]) l++;
1340       proc = l;
1341       if (proc != rank) { /* copy to the outgoing buf*/
1342         ctr[proc]++;
1343         *ptr[proc] = row;
1344         ptr[proc]++;
1345       }
1346     }
1347     /* Update the headers for the current IS */
1348     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1349       if ((ctr_j = ctr[j])) {
1350         sbuf1_j        = sbuf1[j];
1351         k              = ++sbuf1_j[0];
1352         sbuf1_j[2*k]   = ctr_j;
1353         sbuf1_j[2*k-1] = i;
1354       }
1355     }
1356   }
1357 
1358   /*  Now  post the sends */
1359   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1360   for (i=0; i<nrqs; ++i) {
1361     j    = pa[i];
1362     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
1363   }
1364 
1365   /* Post Receives to capture the buffer size */
1366   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
1367   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
1368   rbuf2[0] = tmp + msz;
1369   for (i=1; i<nrqs; ++i) {
1370     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1371   }
1372   for (i=0; i<nrqs; ++i) {
1373     j    = pa[i];
1374     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
1375   }
1376 
1377   /* Send to other procs the buf size they should allocate */
1378 
1379 
1380   /* Receive messages*/
1381   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
1382   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1383   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
1384   {
1385     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1386     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1387     PetscInt   *sbuf2_i;
1388 
1389     for (i=0; i<nrqr; ++i) {
1390       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
1391 
1392       req_size[idex] = 0;
1393       rbuf1_i        = rbuf1[idex];
1394       start          = 2*rbuf1_i[0] + 1;
1395       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1396       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
1397       sbuf2_i        = sbuf2[idex];
1398       for (j=start; j<end; j++) {
1399         id              = rbuf1_i[j] - rstart;
1400         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1401         sbuf2_i[j]      = ncols;
1402         req_size[idex] += ncols;
1403       }
1404       req_source[idex] = r_status1[i].MPI_SOURCE;
1405       /* form the header */
1406       sbuf2_i[0] = req_size[idex];
1407       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1408 
1409       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1410     }
1411   }
1412   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1413   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1414 
1415   /*  recv buffer sizes */
1416   /* Receive messages*/
1417 
1418   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
1419   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1420   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
1421   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1422   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
1423 
1424   for (i=0; i<nrqs; ++i) {
1425     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1426     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
1427     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
1428     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1429     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1430   }
1431   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1432   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1433 
1434   /* Wait on sends1 and sends2 */
1435   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1436   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
1437 
1438   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1439   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1440   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1441   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1442   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1443   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1444 
1445   /* Now allocate buffers for a->j, and send them off */
1446   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1447   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1448   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1449   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1450 
1451   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
1452   {
1453     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1454     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1455     PetscInt cend = C->cmap->rend;
1456     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1457 
1458     for (i=0; i<nrqr; i++) {
1459       rbuf1_i   = rbuf1[i];
1460       sbuf_aj_i = sbuf_aj[i];
1461       ct1       = 2*rbuf1_i[0] + 1;
1462       ct2       = 0;
1463       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1464         kmax = rbuf1[i][2*j];
1465         for (k=0; k<kmax; k++,ct1++) {
1466           row    = rbuf1_i[ct1] - rstart;
1467           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1468           ncols  = nzA + nzB;
1469           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1470 
1471           /* load the column indices for this row into cols*/
1472           cols = sbuf_aj_i + ct2;
1473 
1474           lwrite = 0;
1475           for (l=0; l<nzB; l++) {
1476             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1477           }
1478           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1479           for (l=0; l<nzB; l++) {
1480             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1481           }
1482 
1483           ct2 += ncols;
1484         }
1485       }
1486       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1487     }
1488   }
1489   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
1490   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
1491 
1492   /* Allocate buffers for a->a, and send them off */
1493   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1494   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1495   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1496   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1497 
1498   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1499   {
1500     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1501     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1502     PetscInt    cend   = C->cmap->rend;
1503     PetscInt    *b_j   = b->j;
1504     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1505 
1506     for (i=0; i<nrqr; i++) {
1507       rbuf1_i   = rbuf1[i];
1508       sbuf_aa_i = sbuf_aa[i];
1509       ct1       = 2*rbuf1_i[0]+1;
1510       ct2       = 0;
1511       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1512         kmax = rbuf1_i[2*j];
1513         for (k=0; k<kmax; k++,ct1++) {
1514           row    = rbuf1_i[ct1] - rstart;
1515           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1516           ncols  = nzA + nzB;
1517           cworkB = b_j + b_i[row];
1518           vworkA = a_a + a_i[row];
1519           vworkB = b_a + b_i[row];
1520 
1521           /* load the column values for this row into vals*/
1522           vals = sbuf_aa_i+ct2;
1523 
1524           lwrite = 0;
1525           for (l=0; l<nzB; l++) {
1526             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1527           }
1528           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1529           for (l=0; l<nzB; l++) {
1530             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1531           }
1532 
1533           ct2 += ncols;
1534         }
1535       }
1536       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1537     }
1538   }
1539   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1540   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1541   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1542   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1543 
1544   /* Form the matrix */
1545   /* create col map: global col of C -> local col of submatrices */
1546   {
1547     const PetscInt *icol_i;
1548 #if defined(PETSC_USE_CTABLE)
1549     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
1550     for (i=0; i<ismax; i++) {
1551       if (!allcolumns[i]) {
1552         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1553 
1554         jmax   = ncol[i];
1555         icol_i = icol[i];
1556         cmap_i = cmap[i];
1557         for (j=0; j<jmax; j++) {
1558           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1559         }
1560       } else {
1561         cmap[i] = NULL;
1562       }
1563     }
1564 #else
1565     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
1566     for (i=0; i<ismax; i++) {
1567       if (!allcolumns[i]) {
1568         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
1569         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1570         jmax   = ncol[i];
1571         icol_i = icol[i];
1572         cmap_i = cmap[i];
1573         for (j=0; j<jmax; j++) {
1574           cmap_i[icol_i[j]] = j+1;
1575         }
1576       } else {
1577         cmap[i] = NULL;
1578       }
1579     }
1580 #endif
1581   }
1582 
1583   /* Create lens which is required for MatCreate... */
1584   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1585   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1586   if (ismax) {
1587     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
1588     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1589   }
1590   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1591 
1592   /* Update lens from local data */
1593   for (i=0; i<ismax; i++) {
1594     jmax = nrow[i];
1595     if (!allcolumns[i]) cmap_i = cmap[i];
1596     irow_i = irow[i];
1597     lens_i = lens[i];
1598     for (j=0; j<jmax; j++) {
1599       l   = 0;
1600       row = irow_i[j];
1601       while (row >= C->rmap->range[l+1]) l++;
1602       proc = l;
1603       if (proc == rank) {
1604         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1605         if (!allcolumns[i]) {
1606           for (k=0; k<ncols; k++) {
1607 #if defined(PETSC_USE_CTABLE)
1608             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1609 #else
1610             tcol = cmap_i[cols[k]];
1611 #endif
1612             if (tcol) lens_i[j]++;
1613           }
1614         } else { /* allcolumns */
1615           lens_i[j] = ncols;
1616         }
1617         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1618       }
1619     }
1620   }
1621 
1622   /* Create row map: global row of C -> local row of submatrices */
1623 #if defined(PETSC_USE_CTABLE)
1624   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
1625   for (i=0; i<ismax; i++) {
1626     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1627     irow_i = irow[i];
1628     jmax   = nrow[i];
1629     for (j=0; j<jmax; j++) {
1630       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1631     }
1632   }
1633 #else
1634   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
1635   if (ismax) {
1636     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
1637     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1638   }
1639   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1640   for (i=0; i<ismax; i++) {
1641     rmap_i = rmap[i];
1642     irow_i = irow[i];
1643     jmax   = nrow[i];
1644     for (j=0; j<jmax; j++) {
1645       rmap_i[irow_i[j]] = j;
1646     }
1647   }
1648 #endif
1649 
1650   /* Update lens from offproc data */
1651   {
1652     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1653 
1654     for (tmp2=0; tmp2<nrqs; tmp2++) {
1655       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1656       idex    = pa[idex2];
1657       sbuf1_i = sbuf1[idex];
1658       jmax    = sbuf1_i[0];
1659       ct1     = 2*jmax+1;
1660       ct2     = 0;
1661       rbuf2_i = rbuf2[idex2];
1662       rbuf3_i = rbuf3[idex2];
1663       for (j=1; j<=jmax; j++) {
1664         is_no  = sbuf1_i[2*j-1];
1665         max1   = sbuf1_i[2*j];
1666         lens_i = lens[is_no];
1667         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1668         rmap_i = rmap[is_no];
1669         for (k=0; k<max1; k++,ct1++) {
1670 #if defined(PETSC_USE_CTABLE)
1671           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1672           row--;
1673           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1674 #else
1675           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1676 #endif
1677           max2 = rbuf2_i[ct1];
1678           for (l=0; l<max2; l++,ct2++) {
1679             if (!allcolumns[is_no]) {
1680 #if defined(PETSC_USE_CTABLE)
1681               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1682 #else
1683               tcol = cmap_i[rbuf3_i[ct2]];
1684 #endif
1685               if (tcol) lens_i[row]++;
1686             } else { /* allcolumns */
1687               lens_i[row]++; /* lens_i[row] += max2 ? */
1688             }
1689           }
1690         }
1691       }
1692     }
1693   }
1694   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1695   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1696   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1697   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1698   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1699 
1700   /* Create the submatrices */
1701   if (scall == MAT_REUSE_MATRIX) {
1702     PetscBool flag;
1703 
1704     /*
1705         Assumes new rows are same length as the old rows,hence bug!
1706     */
1707     for (i=0; i<ismax; i++) {
1708       mat = (Mat_SeqAIJ*)(submats[i]->data);
1709       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1710       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1711       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1712       /* Initial matrix as if empty */
1713       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1714 
1715       submats[i]->factortype = C->factortype;
1716     }
1717   } else {
1718     for (i=0; i<ismax; i++) {
1719       PetscInt rbs,cbs;
1720       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1721       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1722 
1723       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1724       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1725 
1726       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1727       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1728       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1729     }
1730   }
1731 
1732   /* Assemble the matrices */
1733   /* First assemble the local rows */
1734   {
1735     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1736     PetscScalar *imat_a;
1737 
1738     for (i=0; i<ismax; i++) {
1739       mat       = (Mat_SeqAIJ*)submats[i]->data;
1740       imat_ilen = mat->ilen;
1741       imat_j    = mat->j;
1742       imat_i    = mat->i;
1743       imat_a    = mat->a;
1744 
1745       if (!allcolumns[i]) cmap_i = cmap[i];
1746       rmap_i = rmap[i];
1747       irow_i = irow[i];
1748       jmax   = nrow[i];
1749       for (j=0; j<jmax; j++) {
1750         l   = 0;
1751         row = irow_i[j];
1752         while (row >= C->rmap->range[l+1]) l++;
1753         proc = l;
1754         if (proc == rank) {
1755           old_row = row;
1756 #if defined(PETSC_USE_CTABLE)
1757           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1758           row--;
1759 #else
1760           row = rmap_i[row];
1761 #endif
1762           ilen_row = imat_ilen[row];
1763           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1764           mat_i    = imat_i[row];
1765           mat_a    = imat_a + mat_i;
1766           mat_j    = imat_j + mat_i;
1767           if (!allcolumns[i]) {
1768             for (k=0; k<ncols; k++) {
1769 #if defined(PETSC_USE_CTABLE)
1770               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1771 #else
1772               tcol = cmap_i[cols[k]];
1773 #endif
1774               if (tcol) {
1775                 *mat_j++ = tcol - 1;
1776                 *mat_a++ = vals[k];
1777                 ilen_row++;
1778               }
1779             }
1780           } else { /* allcolumns */
1781             for (k=0; k<ncols; k++) {
1782               *mat_j++ = cols[k];  /* global col index! */
1783               *mat_a++ = vals[k];
1784               ilen_row++;
1785             }
1786           }
1787           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1788 
1789           imat_ilen[row] = ilen_row;
1790         }
1791       }
1792     }
1793   }
1794 
1795   /*   Now assemble the off proc rows*/
1796   {
1797     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1798     PetscInt    *imat_j,*imat_i;
1799     PetscScalar *imat_a,*rbuf4_i;
1800 
1801     for (tmp2=0; tmp2<nrqs; tmp2++) {
1802       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1803       idex    = pa[idex2];
1804       sbuf1_i = sbuf1[idex];
1805       jmax    = sbuf1_i[0];
1806       ct1     = 2*jmax + 1;
1807       ct2     = 0;
1808       rbuf2_i = rbuf2[idex2];
1809       rbuf3_i = rbuf3[idex2];
1810       rbuf4_i = rbuf4[idex2];
1811       for (j=1; j<=jmax; j++) {
1812         is_no     = sbuf1_i[2*j-1];
1813         rmap_i    = rmap[is_no];
1814         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1815         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1816         imat_ilen = mat->ilen;
1817         imat_j    = mat->j;
1818         imat_i    = mat->i;
1819         imat_a    = mat->a;
1820         max1      = sbuf1_i[2*j];
1821         for (k=0; k<max1; k++,ct1++) {
1822           row = sbuf1_i[ct1];
1823 #if defined(PETSC_USE_CTABLE)
1824           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1825           row--;
1826 #else
1827           row = rmap_i[row];
1828 #endif
1829           ilen  = imat_ilen[row];
1830           mat_i = imat_i[row];
1831           mat_a = imat_a + mat_i;
1832           mat_j = imat_j + mat_i;
1833           max2  = rbuf2_i[ct1];
1834           if (!allcolumns[is_no]) {
1835             for (l=0; l<max2; l++,ct2++) {
1836 
1837 #if defined(PETSC_USE_CTABLE)
1838               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1839 #else
1840               tcol = cmap_i[rbuf3_i[ct2]];
1841 #endif
1842               if (tcol) {
1843                 *mat_j++ = tcol - 1;
1844                 *mat_a++ = rbuf4_i[ct2];
1845                 ilen++;
1846               }
1847             }
1848           } else { /* allcolumns */
1849             for (l=0; l<max2; l++,ct2++) {
1850               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1851               *mat_a++ = rbuf4_i[ct2];
1852               ilen++;
1853             }
1854           }
1855           imat_ilen[row] = ilen;
1856         }
1857       }
1858     }
1859   }
1860 
1861   /* sort the rows */
1862   {
1863     PetscInt    *imat_ilen,*imat_j,*imat_i;
1864     PetscScalar *imat_a;
1865 
1866     for (i=0; i<ismax; i++) {
1867       mat       = (Mat_SeqAIJ*)submats[i]->data;
1868       imat_j    = mat->j;
1869       imat_i    = mat->i;
1870       imat_a    = mat->a;
1871       imat_ilen = mat->ilen;
1872 
1873       if (allcolumns[i]) continue;
1874       jmax = nrow[i];
1875       for (j=0; j<jmax; j++) {
1876         PetscInt ilen;
1877 
1878         mat_i = imat_i[j];
1879         mat_a = imat_a + mat_i;
1880         mat_j = imat_j + mat_i;
1881         ilen  = imat_ilen[j];
1882         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
1883       }
1884     }
1885   }
1886 
1887   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1888   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1889   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1890   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1891   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1892 
1893   /* Restore the indices */
1894   for (i=0; i<ismax; i++) {
1895     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1896     if (!allcolumns[i]) {
1897       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1898     }
1899   }
1900 
1901   /* Destroy allocated memory */
1902   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1903   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1904   ierr = PetscFree(pa);CHKERRQ(ierr);
1905 
1906   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1907   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1908   for (i=0; i<nrqr; ++i) {
1909     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1910   }
1911   for (i=0; i<nrqs; ++i) {
1912     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1913     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1914   }
1915 
1916   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1917   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1918   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1919   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1920   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1921   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1922   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1923 
1924 #if defined(PETSC_USE_CTABLE)
1925   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1926 #else
1927   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1928 #endif
1929   ierr = PetscFree(rmap);CHKERRQ(ierr);
1930 
1931   for (i=0; i<ismax; i++) {
1932     if (!allcolumns[i]) {
1933 #if defined(PETSC_USE_CTABLE)
1934       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1935 #else
1936       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1937 #endif
1938     }
1939   }
1940   ierr = PetscFree(cmap);CHKERRQ(ierr);
1941   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1942   ierr = PetscFree(lens);CHKERRQ(ierr);
1943 
1944   for (i=0; i<ismax; i++) {
1945     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1946     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1947   }
1948   PetscFunctionReturn(0);
1949 }
1950 
1951 /*
1952  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1953  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
1954  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
1955  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1956  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1957  state, and needs to be "assembled" later by compressing B's column space.
1958 
1959  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1960  Following this call, C->A & C->B have been created, even if empty.
1961  */
1962 #undef __FUNCT__
1963 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
1964 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1965 {
1966   /* If making this function public, change the error returned in this function away from _PLIB. */
1967   PetscErrorCode ierr;
1968   Mat_MPIAIJ     *aij;
1969   Mat_SeqAIJ     *Baij;
1970   PetscBool      seqaij,Bdisassembled;
1971   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1972   PetscScalar    v;
1973   const PetscInt *rowindices,*colindices;
1974 
1975   PetscFunctionBegin;
1976   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1977   if (A) {
1978     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1979     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1980     if (rowemb) {
1981       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1982       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1983     } else {
1984       if (C->rmap->n != A->rmap->n) {
1985 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1986       }
1987     }
1988     if (dcolemb) {
1989       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
1990       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1991     } else {
1992       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1993     }
1994   }
1995   if (B) {
1996     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1997     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1998     if (rowemb) {
1999       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2000       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2001     } else {
2002       if (C->rmap->n != B->rmap->n) {
2003 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2004       }
2005     }
2006     if (ocolemb) {
2007       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2008       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2009     } else {
2010       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2011     }
2012   }
2013 
2014   aij    = (Mat_MPIAIJ*)(C->data);
2015   if (!aij->A) {
2016     /* Mimic parts of MatMPIAIJSetPreallocation() */
2017     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2018     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2019     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2020     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2021     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2022   }
2023   if (A) {
2024     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2025   } else {
2026     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2027   }
2028   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2029     /*
2030       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2031       need to "disassemble" B -- convert it to using C's global indices.
2032       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2033 
2034       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2035       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2036 
2037       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2038       At least avoid calling MatSetValues() and the implied searches?
2039     */
2040 
2041     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2042 #if defined(PETSC_USE_CTABLE)
2043       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2044 #else
2045       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2046       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2047       if (aij->B) {
2048         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2049       }
2050 #endif
2051       ngcol = 0;
2052       if (aij->lvec) {
2053 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2054       }
2055       if (aij->garray) {
2056 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2057 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2058       }
2059       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2060       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2061     }
2062     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2063       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2064     }
2065     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2066       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2067     }
2068   }
2069   Bdisassembled = PETSC_FALSE;
2070   if (!aij->B) {
2071     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2072     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2073     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2074     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2075     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2076     Bdisassembled = PETSC_TRUE;
2077   }
2078   if (B) {
2079     Baij = (Mat_SeqAIJ*)(B->data);
2080     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2081       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2082       for (i=0; i<B->rmap->n; i++) {
2083 	nz[i] = Baij->i[i+1] - Baij->i[i];
2084       }
2085       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2086       ierr = PetscFree(nz);CHKERRQ(ierr);
2087     }
2088 
2089     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2090     shift = rend-rstart;
2091     count = 0;
2092     rowindices = NULL;
2093     colindices = NULL;
2094     if (rowemb) {
2095       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2096     }
2097     if (ocolemb) {
2098       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2099     }
2100     for (i=0; i<B->rmap->n; i++) {
2101       PetscInt row;
2102       row = i;
2103       if (rowindices) row = rowindices[i];
2104       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2105 	col  = Baij->j[count];
2106 	if (colindices) col = colindices[col];
2107 	if (Bdisassembled && col>=rstart) col += shift;
2108 	v    = Baij->a[count];
2109 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
2110 	++count;
2111       }
2112     }
2113     /* No assembly for aij->B is necessary. */
2114     /* FIXME: set aij->B's nonzerostate correctly. */
2115   } else {
2116     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
2117   }
2118   C->preallocated  = PETSC_TRUE;
2119   C->was_assembled = PETSC_FALSE;
2120   C->assembled     = PETSC_FALSE;
2121    /*
2122       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2123       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2124    */
2125   PetscFunctionReturn(0);
2126 }
2127 
2128 #undef __FUNCT__
2129 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
2130 /*
2131   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
2132  */
2133 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2134 {
2135   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2136 
2137   PetscFunctionBegin;
2138   PetscValidPointer(A,2);
2139   PetscValidPointer(B,3);
2140   /* FIXME: make sure C is assembled */
2141   *A = aij->A;
2142   *B = aij->B;
2143   /* Note that we don't incref *A and *B, so be careful! */
2144   PetscFunctionReturn(0);
2145 }
2146 
2147 /*
2148   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2149   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2150 */
2151 #undef __FUNCT__
2152 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
2153 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2154                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2155 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2156 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2157 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2158 {
2159   PetscErrorCode ierr;
2160   PetscMPIInt    isize,flag;
2161   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
2162   Mat            *A,*B;
2163   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
2164 
2165   PetscFunctionBegin;
2166   if (!ismax) PetscFunctionReturn(0);
2167 
2168   for (i = 0, cismax = 0; i < ismax; ++i) {
2169     PetscMPIInt isize;
2170     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
2171     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2172     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
2173     if (isize > 1) ++cismax;
2174   }
2175   /*
2176      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2177      ispar counts the number of parallel ISs across C's comm.
2178   */
2179   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2180   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2181     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2182     PetscFunctionReturn(0);
2183   }
2184 
2185   /* if (ispar) */
2186   /*
2187     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2188     These are used to extract the off-diag portion of the resulting parallel matrix.
2189     The row IS for the off-diag portion is the same as for the diag portion,
2190     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2191   */
2192   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
2193   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
2194   for (i = 0, ii = 0; i < ismax; ++i) {
2195     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2196     if (isize > 1) {
2197       /*
2198 	 TODO: This is the part that's ***NOT SCALABLE***.
2199 	 To fix this we need to extract just the indices of C's nonzero columns
2200 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2201 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
2202 	 to be done without serializing on the IS list, so, most likely, it is best
2203 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2204       */
2205       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
2206       /* Now we have to
2207 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2208 	     were sorted on each rank, concatenated they might no longer be sorted;
2209 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2210 	     indices in the nondecreasing order to the original index positions.
2211 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2212       */
2213       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
2214       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
2215       ++ii;
2216     }
2217   }
2218   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
2219   for (i = 0, ii = 0; i < ismax; ++i) {
2220     PetscInt       j,issize;
2221     const PetscInt *indices;
2222 
2223     /*
2224        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2225      */
2226     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
2227     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
2228     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
2229     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
2230     for (j = 1; j < issize; ++j) {
2231       if (indices[j] == indices[j-1]) {
2232 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2233       }
2234     }
2235     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
2236 
2237 
2238     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
2239     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
2240     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
2241     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
2242     for (j = 1; j < issize; ++j) {
2243       if (indices[j-1] == indices[j]) {
2244 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2245       }
2246     }
2247     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
2248     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2249     if (isize > 1) {
2250       cisrow[ii] = isrow[i];
2251       ++ii;
2252     }
2253   }
2254   /*
2255     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2256     array of sequential matrices underlying the resulting parallel matrices.
2257     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2258     contain duplicates.
2259 
2260     There are as many diag matrices as there are original index sets. There are only as many parallel
2261     and off-diag matrices, as there are parallel (comm size > 1) index sets.
2262 
2263     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2264     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2265       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2266       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2267       with A[i] and B[ii] extracted from the corresponding MPI submat.
2268     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2269       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
2270       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2271       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
2272       values into A[i] and B[ii] sitting inside the corresponding submat.
2273     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2274       A[i], B[ii], AA[i] or BB[ii] matrices.
2275   */
2276   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2277   if (scall == MAT_INITIAL_MATRIX) {
2278     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
2279     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2280   } else {
2281     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
2282     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
2283     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2284     for (i = 0, ii = 0; i < ismax; ++i) {
2285       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2286       if (isize > 1) {
2287 	Mat AA,BB;
2288 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
2289 	if (!isrow_p[i] && !iscol_p[i]) {
2290 	  A[i] = AA;
2291 	} else {
2292 	  /* TODO: extract A[i] composed on AA. */
2293 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2294 	}
2295 	if (!isrow_p[i] && !ciscol_p[ii]) {
2296 	  B[ii] = BB;
2297 	} else {
2298 	  /* TODO: extract B[ii] composed on BB. */
2299 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
2300 	}
2301 	++ii;
2302       } else {
2303 	if (!isrow_p[i] && !iscol_p[i]) {
2304 	  A[i] = (*submat)[i];
2305 	} else {
2306 	  /* TODO: extract A[i] composed on (*submat)[i]. */
2307 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
2308 	}
2309       }
2310     }
2311   }
2312   /* Now obtain the sequential A and B submatrices separately. */
2313   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* WORKAROUND: no clean fix is known yet for the following mismatch.
   * The local column size of B[i] can differ from the size of ciscol[i]:
   * B[i]'s column size is ultimately determined during MatAssembly, whereas
   * ciscol[i] is computed as the complement of iscol[i]. A proper fix would
   * keep only the indices of nonzero columns when computing the complement
   * ciscol[i]; until then, force B[i]'s local column size to match ciscol[i].
   * */
2321   if(scall==MAT_REUSE_MATRIX){
2322 	for(i=0; i<cismax; i++){
2323 	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
2324 	  B[i]->cmap->n = ciscol_localsize;
2325 	}
2326   }
2327   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
2328   /*
2329     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2330     matrices A & B have been extracted directly into the parallel matrices containing them, or
2331     simply into the sequential matrix identical with the corresponding A (if isize == 1).
2332     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2333     to have the same sparsity pattern.
2334     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2335     must be constructed for C. This is done by setseqmat(s).
2336   */
2337   for (i = 0, ii = 0; i < ismax; ++i) {
2338     /*
2339        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2340        That way we can avoid sorting and computing permutations when reusing.
2341        To this end:
2342         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2343 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
2344 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2345     */
2346     MatStructure pattern;
2347     if (scall == MAT_INITIAL_MATRIX) {
2348       pattern = DIFFERENT_NONZERO_PATTERN;
2349     } else {
2350       pattern = SAME_NONZERO_PATTERN;
2351     }
2352     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
2353     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2354     if (isize > 1) {
2355       if (scall == MAT_INITIAL_MATRIX) {
2356 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
2357 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2358 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
2359 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
2360 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
2361       }
2362       /*
2363 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2364       */
2365       {
2366 	Mat AA,BB;
2367 	AA = NULL;
2368 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2369 	BB = NULL;
2370 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2371 	if (AA || BB) {
2372 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
2373 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2374 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2375 	}
2376 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2377 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2378 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
2379 	}
2380 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2381 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
2382 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
2383 	}
2384       }
2385       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
2386       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
2387       ++ii;
2388     } else { /* if (isize == 1) */
2389       if (scall == MAT_INITIAL_MATRIX) {
2390 	if (isrow_p[i] || iscol_p[i]) {
2391 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
2392 	} else (*submat)[i] = A[i];
2393       }
2394       if (isrow_p[i] || iscol_p[i]) {
2395 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
2396 	/* Otherwise A is extracted straight into (*submats)[i]. */
	/* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2398 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
2399       }
2400     }
2401     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
2402     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
2403   }
2404   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
2405   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
2406   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
2407   ierr = PetscFree(A);CHKERRQ(ierr);
2408   ierr = PetscFree(B);CHKERRQ(ierr);
2409   PetscFunctionReturn(0);
2410 }
2411 
2412 
2413 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
/*
   MatGetSubMatricesMPI_MPIAIJ - AIJ-specific entry point for extracting (possibly
   parallel) submatrices of an MPIAIJ matrix C for the given row/column index sets.

   This is a thin wrapper: it delegates all work to the format-generic driver
   MatGetSubMatricesMPI_MPIXAIJ(), supplying the four AIJ-specific callbacks it
   needs (sequential submatrix extraction, retrieval of the underlying seq
   diag/off-diag matrices, and the single/paired seq-matrix embedding routines).

   Input Parameters:
+  C     - the MPIAIJ matrix to extract from
.  ismax - number of index-set pairs (number of submatrices requested)
.  isrow - array of row index sets, one per submatrix
.  iscol - array of column index sets, one per submatrix
-  scall - MAT_INITIAL_MATRIX to create submat[], or MAT_REUSE_MATRIX to refill it

   Output Parameter:
.  submat - array of extracted submatrices
*/
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2423 }
2424