xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision f489ac74240ff84cac916603ef355e2f8f5405a9)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - increases the overlap of the nidx index sets
  in is[] by one layer, using a memory-scalable scheme (two-sided handshakes plus PetscSF
  broadcasts) whose storage is proportional to the locally requested rows rather than the
  global matrix size.

  Collective on mat.

  Input Parameters:
+ mat  - the MPIAIJ matrix
. nidx - number of index sets
- is   - the index sets (global row numbers); each is replaced in place by the enlarged set

  Outline:
    1. gather all requested rows and determine which live on remote ranks,
    2. ship each remote row to its owning rank (PetscCommBuildTwoSided + SF broadcast),
    3. owners compute the overlap columns (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and
       send them back over a second SF,
    4. locally-owned rows are expanded (MatIncreaseOverlap_MPIAIJ_Local_Scalable) while the
       reply is in flight, then the remote data is merged in
       (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscInt       **iindices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  iindices = (PetscInt**)indices; /* required because PetscFreeN() cannot work on const */
  ierr = PetscFree2(iindices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    /* no rank needs remote rows, so the whole update is local */
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size (two ints per requested row: <is id, row>) */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* one SF leaf per incoming integer, rooted at the sender's buffer offset */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second handshake: tell the requesters how much overlap data is coming back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the overlap rows computed by the remote owners into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merges the overlap rows received from the
  remote row owners into the local index sets.

  Input Parameters:
+ mat      - the MPIAIJ matrix (not referenced in this routine)
. nidx     - number of index sets
. is       - the index sets; each IS is destroyed and recreated containing the merged,
             sorted, duplicate-free indices
. nrecvs   - number of PetscInt entries in recvdata
- recvdata - received message stream of records <is id, data size, row, row, ...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record the current size of each IS; track the largest for sizing the scratch buffer */
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slot of (max_lsize+nrecvs) ints per IS: enough for its current indices plus,
   * in the worst case, every received entry landing in that one IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
239 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - for every row requested by a remote rank,
  computes the set of columns (global numbering) appearing in that locally-owned row,
  and packs the results per requesting rank / per index set into a send buffer.

  Input Parameters:
+ mat       - the MPIAIJ matrix
. nidx      - number of index sets
. nfrom     - number of ranks that requested rows from us
. fromranks - the requesting ranks
. fromsizes - pairs <message size, offset> per requesting rank
- fromrows  - received pairs <is id, row> describing the requested rows

  Output Parameters:
+ sbrowsizes - per requesting rank, pairs <reply size, reply offset> (caller frees)
- sbrows     - packed reply: triples <is id, count, count columns> (caller frees)

  The routine makes two passes over the requests: the first pass only counts the
  (deduplicated) columns per <rank, is> so that one exact-sized buffer can be allocated,
  the second pass fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /* amat: diagonal block, bmat: off-diagonal block, gcols: global column numbers of bmat */
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the deduplicated overlap columns per <rank, is> pair */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* convert to local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs expected by the caller */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
361 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expands each index set with the columns
  (global numbering) of its locally-owned rows; rows owned by other ranks are skipped
  (they are handled by the send/receive path).

  Input Parameters:
+ mat  - the MPIAIJ matrix
. nidx - number of index sets
- is   - the index sets; each IS is destroyed and recreated with the sorted,
         duplicate-free union of its old indices' local-row columns
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /* amat: diagonal block, bmat: off-diagonal block, gcols: global column numbers of bmat */
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows are expanded by their owners */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart; /* shift diagonal-block columns to global numbering */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]]; /* off-diagonal columns are already mapped to global */
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
425 
426 
427 /*
428   Sample message format:
429   If a processor A wants processor B to process some elements corresponding
430   to index sets is[1],is[5]
431   mesg [0] = 2   (no of index sets in the mesg)
432   -----------
433   mesg [1] = 1 => is[1]
434   mesg [2] = sizeof(is[1]);
435   -----------
436   mesg [3] = 5  => is[5]
437   mesg [4] = sizeof(is[5]);
438   -----------
  mesg [5] ... mesg [n]   data(is[1])
  -----------
  mesg [n+1] ... mesg [m] data(is[5])
  -----------
445 
446   Notes:
447   nrqs - no of requests sent (or to be sent out)
448   nrqr - no of requests recieved (which have to be or which have been processed
449 */
450 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
451 {
452   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
453   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
454   const PetscInt **idx,*idx_i;
455   PetscInt       **iidx;
456   PetscInt       *n,**data,len;
457 #if defined(PETSC_USE_CTABLE)
458   PetscTable     *table_data,table_data_i;
459   PetscInt       *tdata,tcount,tcount_max;
460 #else
461   PetscInt       *data_i,*d_p;
462 #endif
463   PetscErrorCode ierr;
464   PetscMPIInt    size,rank,tag1,tag2;
465   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
466   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
467   PetscBT        *table;
468   MPI_Comm       comm;
469   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
470   MPI_Status     *s_status,*recv_status;
471   char           *t_p;
472 
473   PetscFunctionBegin;
474   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
475   size = c->size;
476   rank = c->rank;
477   M    = C->rmap->N;
478 
479   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
480   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
481 
482   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
483 
484   for (i=0; i<imax; i++) {
485     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
486     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
487   }
488 
489   /* evaluate communication - mesg to who,length of mesg, and buffer space
490      required. Based on this, buffers are allocated, and data copied into them  */
491   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
492   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
493   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
494   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
495   for (i=0; i<imax; i++) {
496     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
497     idx_i = idx[i];
498     len   = n[i];
499     for (j=0; j<len; j++) {
500       row = idx_i[j];
501       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
502       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
503       w4[proc]++;
504     }
505     for (j=0; j<size; j++) {
506       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
507     }
508   }
509 
510   nrqs     = 0;              /* no of outgoing messages */
511   msz      = 0;              /* total mesg length (for all proc */
512   w1[rank] = 0;              /* no mesg sent to intself */
513   w3[rank] = 0;
514   for (i=0; i<size; i++) {
515     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
516   }
517   /* pa - is list of processors to communicate with */
518   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
519   for (i=0,j=0; i<size; i++) {
520     if (w1[i]) {pa[j] = i; j++;}
521   }
522 
523   /* Each message would have a header = 1 + 2*(no of IS) + data */
524   for (i=0; i<nrqs; i++) {
525     j      = pa[i];
526     w1[j] += w2[j] + 2*w3[j];
527     msz   += w1[j];
528   }
529 
530   /* Determine the number of messages to expect, their lengths, from from-ids */
531   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
532   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
533 
534   /* Now post the Irecvs corresponding to these messages */
535   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
536 
537   /* Allocate Memory for outgoing messages */
538   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
539   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
540   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
541 
542   {
543     PetscInt *iptr = tmp,ict  = 0;
544     for (i=0; i<nrqs; i++) {
545       j         = pa[i];
546       iptr     +=  ict;
547       outdat[j] = iptr;
548       ict       = w1[j];
549     }
550   }
551 
552   /* Form the outgoing messages */
553   /* plug in the headers */
554   for (i=0; i<nrqs; i++) {
555     j            = pa[i];
556     outdat[j][0] = 0;
557     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
558     ptr[j]       = outdat[j] + 2*w3[j] + 1;
559   }
560 
561   /* Memory for doing local proc's work */
562   {
563     PetscInt M_BPB_imax = 0;
564 #if defined(PETSC_USE_CTABLE)
565     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
566     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
567     for (i=0; i<imax; i++) {
568       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
569     }
570     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
571     for (i=0; i<imax; i++) {
572       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
573     }
574 #else
575     PetscInt Mimax = 0;
576     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
577     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
578     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
579     for (i=0; i<imax; i++) {
580       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
581       data[i]  = d_p + M*i;
582     }
583 #endif
584   }
585 
586   /* Parse the IS and update local tables and the outgoing buf with the data */
587   {
588     PetscInt n_i,isz_i,*outdat_j,ctr_j;
589     PetscBT  table_i;
590 
591     for (i=0; i<imax; i++) {
592       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
593       n_i     = n[i];
594       table_i = table[i];
595       idx_i   = idx[i];
596 #if defined(PETSC_USE_CTABLE)
597       table_data_i = table_data[i];
598 #else
599       data_i  = data[i];
600 #endif
601       isz_i   = isz[i];
602       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
603         row  = idx_i[j];
604         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
605         if (proc != rank) { /* copy to the outgoing buffer */
606           ctr[proc]++;
607           *ptr[proc] = row;
608           ptr[proc]++;
609         } else if (!PetscBTLookupSet(table_i,row)) {
610 #if defined(PETSC_USE_CTABLE)
611           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
612 #else
613           data_i[isz_i] = row; /* Update the local table */
614 #endif
615           isz_i++;
616         }
617       }
618       /* Update the headers for the current IS */
619       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
620         if ((ctr_j = ctr[j])) {
621           outdat_j        = outdat[j];
622           k               = ++outdat_j[0];
623           outdat_j[2*k]   = ctr_j;
624           outdat_j[2*k-1] = i;
625         }
626       }
627       isz[i] = isz_i;
628     }
629   }
630 
631   /*  Now  post the sends */
632   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
633   for (i=0; i<nrqs; ++i) {
634     j    = pa[i];
635     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
636   }
637 
638   /* No longer need the original indices */
639   for (i=0; i<imax; ++i) {
640     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
641   }
642   iidx = (PetscInt**)idx; /* required because PetscFreeN() cannot work on const */
643   ierr = PetscFree2(iidx,n);CHKERRQ(ierr);
644 
645   for (i=0; i<imax; ++i) {
646     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
647   }
648 
649   /* Do Local work */
650 #if defined(PETSC_USE_CTABLE)
651   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
652 #else
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
654 #endif
655 
656   /* Receive messages */
657   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
658   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
659 
660   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
661   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
662 
663   /* Phase 1 sends are complete - deallocate buffers */
664   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
665   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
666 
667   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
668   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
669   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
670   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
671   ierr = PetscFree(rbuf);CHKERRQ(ierr);
672 
673 
674   /* Send the data back */
675   /* Do a global reduction to know the buffer space req for incoming messages */
676   {
677     PetscMPIInt *rw1;
678 
679     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
680 
681     for (i=0; i<nrqr; ++i) {
682       proc = recv_status[i].MPI_SOURCE;
683 
684       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
685       rw1[proc] = isz1[i];
686     }
687     ierr = PetscFree(onodes1);CHKERRQ(ierr);
688     ierr = PetscFree(olengths1);CHKERRQ(ierr);
689 
690     /* Determine the number of messages to expect, their lengths, from from-ids */
691     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
692     ierr = PetscFree(rw1);CHKERRQ(ierr);
693   }
694   /* Now post the Irecvs corresponding to these messages */
695   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
696 
697   /* Now  post the sends */
698   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
699   for (i=0; i<nrqr; ++i) {
700     j    = recv_status[i].MPI_SOURCE;
701     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
702   }
703 
704   /* receive work done on other processors */
705   {
706     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
707     PetscMPIInt idex;
708     PetscBT     table_i;
709     MPI_Status  *status2;
710 
711     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
712     for (i=0; i<nrqs; ++i) {
713       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
714       /* Process the message */
715       rbuf2_i = rbuf2[idex];
716       ct1     = 2*rbuf2_i[0]+1;
717       jmax    = rbuf2[idex][0];
718       for (j=1; j<=jmax; j++) {
719         max     = rbuf2_i[2*j];
720         is_no   = rbuf2_i[2*j-1];
721         isz_i   = isz[is_no];
722         table_i = table[is_no];
723 #if defined(PETSC_USE_CTABLE)
724         table_data_i = table_data[is_no];
725 #else
726         data_i  = data[is_no];
727 #endif
728         for (k=0; k<max; k++,ct1++) {
729           row = rbuf2_i[ct1];
730           if (!PetscBTLookupSet(table_i,row)) {
731 #if defined(PETSC_USE_CTABLE)
732             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
733 #else
734             data_i[isz_i] = row;
735 #endif
736             isz_i++;
737           }
738         }
739         isz[is_no] = isz_i;
740       }
741     }
742 
743     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
744     ierr = PetscFree(status2);CHKERRQ(ierr);
745   }
746 
747 #if defined(PETSC_USE_CTABLE)
748   tcount_max = 0;
749   for (i=0; i<imax; ++i) {
750     table_data_i = table_data[i];
751     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
752     if (tcount_max < tcount) tcount_max = tcount;
753   }
754   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
755 #endif
756 
757   for (i=0; i<imax; ++i) {
758 #if defined(PETSC_USE_CTABLE)
759     PetscTablePosition tpos;
760     table_data_i = table_data[i];
761 
762     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
763     while (tpos) {
764       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
765       tdata[--j] = --k;
766     }
767     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
768 #else
769     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
770 #endif
771   }
772 
773   ierr = PetscFree(onodes2);CHKERRQ(ierr);
774   ierr = PetscFree(olengths2);CHKERRQ(ierr);
775 
776   ierr = PetscFree(pa);CHKERRQ(ierr);
777   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
778   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
779   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
780   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
781   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
782   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
783   ierr = PetscFree(s_status);CHKERRQ(ierr);
784   ierr = PetscFree(recv_status);CHKERRQ(ierr);
785   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
786   ierr = PetscFree(xdata);CHKERRQ(ierr);
787   ierr = PetscFree(isz1);CHKERRQ(ierr);
788 #if defined(PETSC_USE_CTABLE)
789   for (i=0; i<imax; i++) {
790     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
791   }
792   ierr = PetscFree(table_data);CHKERRQ(ierr);
793   ierr = PetscFree(tdata);CHKERRQ(ierr);
794   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
795 #else
796   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
797 #endif
798   PetscFunctionReturn(0);
799 }
800 
801 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
803        the work on the local processor.
804 
805      Inputs:
806       C      - MAT_MPIAIJ;
807       imax - total no of index sets processed at a time;
808       table  - an array of char - size = m bits.
809 
810      Output:
811       isz    - array containing the count of the solution elements corresponding
812                to each index set;
813       data or table_data  - pointer to the solutions
814 */
815 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
816 {
817   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
818   Mat        A  = c->A,B = c->B;
819   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
820   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
821   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
822   PetscBT    table_i;
823 #if defined(PETSC_USE_CTABLE)
824   PetscTable         table_data_i;
825   PetscErrorCode     ierr;
826   PetscTablePosition tpos;
827   PetscInt           tcount,*tdata;
828 #else
829   PetscInt           *data_i;
830 #endif
831 
832   PetscFunctionBegin;
833   rstart = C->rmap->rstart;
834   cstart = C->cmap->rstart;
835   ai     = a->i;
836   aj     = a->j;
837   bi     = b->i;
838   bj     = b->j;
839   garray = c->garray;
840 
841   for (i=0; i<imax; i++) {
842 #if defined(PETSC_USE_CTABLE)
843     /* copy existing entries of table_data_i into tdata[] */
844     table_data_i = table_data[i];
845     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
846     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
847 
848     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
849     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
850     while (tpos) {
851       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
852       tdata[--j] = --row;
853       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
854     }
855 #else
856     data_i  = data[i];
857 #endif
858     table_i = table[i];
859     isz_i   = isz[i];
860     max     = isz[i];
861 
862     for (j=0; j<max; j++) {
863 #if defined(PETSC_USE_CTABLE)
864       row   = tdata[j] - rstart;
865 #else
866       row   = data_i[j] - rstart;
867 #endif
868       start = ai[row];
869       end   = ai[row+1];
870       for (k=start; k<end; k++) { /* Amat */
871         val = aj[k] + cstart;
872         if (!PetscBTLookupSet(table_i,val)) {
873 #if defined(PETSC_USE_CTABLE)
874           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
875 #else
876           data_i[isz_i] = val;
877 #endif
878           isz_i++;
879         }
880       }
881       start = bi[row];
882       end   = bi[row+1];
883       for (k=start; k<end; k++) { /* Bmat */
884         val = garray[bj[k]];
885         if (!PetscBTLookupSet(table_i,val)) {
886 #if defined(PETSC_USE_CTABLE)
887           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
888 #else
889           data_i[isz_i] = val;
890 #endif
891           isz_i++;
892         }
893       }
894     }
895     isz[i] = isz_i;
896 
897 #if defined(PETSC_USE_CTABLE)
898     ierr = PetscFree(tdata);CHKERRQ(ierr);
899 #endif
900   }
901   PetscFunctionReturn(0);
902 }
903 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
923 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
924 {
925   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
926   Mat            A  = c->A,B = c->B;
927   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
928   PetscErrorCode ierr;
929   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
930   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
931   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
932   PetscInt       *rbuf_i,kmax,rbuf_0;
933   PetscBT        xtable;
934 
935   PetscFunctionBegin;
936   m      = C->rmap->N;
937   rstart = C->rmap->rstart;
938   cstart = C->cmap->rstart;
939   ai     = a->i;
940   aj     = a->j;
941   bi     = b->i;
942   bj     = b->j;
943   garray = c->garray;
944 
945 
946   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
947     rbuf_i =  rbuf[i];
948     rbuf_0 =  rbuf_i[0];
949     ct    += rbuf_0;
950     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
951   }
952 
953   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
954   else max1 = 1;
955   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
956   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
957   ++no_malloc;
958   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
959   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
960 
961   ct3 = 0;
962   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
963     rbuf_i =  rbuf[i];
964     rbuf_0 =  rbuf_i[0];
965     ct1    =  2*rbuf_0+1;
966     ct2    =  ct1;
967     ct3   += ct1;
968     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
970       oct2 = ct2;
971       kmax = rbuf_i[2*j];
972       for (k=0; k<kmax; k++,ct1++) {
973         row = rbuf_i[ct1];
974         if (!PetscBTLookupSet(xtable,row)) {
975           if (!(ct3 < mem_estimate)) {
976             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
978             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
979             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
980             xdata[0]     = tmp;
981             mem_estimate = new_estimate; ++no_malloc;
982             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983           }
984           xdata[i][ct2++] = row;
985           ct3++;
986         }
987       }
988       for (k=oct2,max2=ct2; k<max2; k++) {
989         row   = xdata[i][k] - rstart;
990         start = ai[row];
991         end   = ai[row+1];
992         for (l=start; l<end; l++) {
993           val = aj[l] + cstart;
994           if (!PetscBTLookupSet(xtable,val)) {
995             if (!(ct3 < mem_estimate)) {
996               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
998               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
999               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1000               xdata[0]     = tmp;
1001               mem_estimate = new_estimate; ++no_malloc;
1002               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003             }
1004             xdata[i][ct2++] = val;
1005             ct3++;
1006           }
1007         }
1008         start = bi[row];
1009         end   = bi[row+1];
1010         for (l=start; l<end; l++) {
1011           val = garray[bj[l]];
1012           if (!PetscBTLookupSet(xtable,val)) {
1013             if (!(ct3 < mem_estimate)) {
1014               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1016               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1017               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1018               xdata[0]     = tmp;
1019               mem_estimate = new_estimate; ++no_malloc;
1020               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021             }
1022             xdata[i][ct2++] = val;
1023             ct3++;
1024           }
1025         }
1026       }
1027       /* Update the header*/
1028       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030     }
1031     xdata[i][0] = rbuf_0;
1032     xdata[i+1]  = xdata[i] + ct2;
1033     isz1[i]     = ct2; /* size of each message */
1034   }
1035   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1036   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 /* -------------------------------------------------------------------------*/
1040 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041 /*
1042     Every processor gets the entire matrix
1043 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  /* recvcounts/displs start NULL so the MAT_GET_VALUES branch below knows
     whether the MAT_INITIAL_MATRIX branch already allocated them */
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[] has global length A->rmap->N; each rank fills in its own rows
       (A part + B part) then all ranks exchange via Allgatherv */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* assemble my rows directly into B's j array at my row offset, so the
       in-place Allgatherv below needs no separate send buffer */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* For each local row, emit columns in globally increasing order:
       B-part columns below the diagonal block, then the A (diagonal) part,
       then the remaining B-part columns above it */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs switch meaning here: now counted in nonzeros per rank
       rather than rows per rank */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built in a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    /* same in-place trick as for the column indices: write my values into
       B's a array at my offset, then Allgatherv fills in the rest */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* mirror the lower/diagonal/upper merge used for the column indices so
       values line up with their indices */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* in the MAT_REUSE_MATRIX path recvcounts/displs were never allocated */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1226 
1227 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1228 {
1229   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1230   Mat            submat,A = c->A,B = c->B;
1231   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1232   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1233   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1234   const PetscInt *icol,*irow;
1235   PetscInt       nrow,ncol,start;
1236   PetscErrorCode ierr;
1237   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1238   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1239   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1240   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1241   PetscInt       *lens,rmax,ncols,*cols,Crow;
1242 #if defined(PETSC_USE_CTABLE)
1243   PetscTable     cmap,rmap;
1244   PetscInt       *cmap_loc,*rmap_loc;
1245 #else
1246   PetscInt       *cmap,*rmap;
1247 #endif
1248   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1249   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1250   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1251   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1252   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1253   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1254   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1255   MPI_Comm       comm;
1256   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1257   PetscMPIInt    *onodes1,*olengths1,idex,end;
1258   Mat_SubSppt    *smatis1;
1259   PetscBool      isrowsorted,iscolsorted;
1260 
1261   PetscFunctionBegin;
1262   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1263 
1264   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1265   size = c->size;
1266   rank = c->rank;
1267 
1268   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1269   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1270   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1271   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1272   if (allcolumns) {
1273     icol = NULL;
1274     ncol = C->cmap->N;
1275   } else {
1276     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1277     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1278   }
1279 
1280   if (scall == MAT_INITIAL_MATRIX) {
1281     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1282 
1283     /* Get some new tags to keep the communication clean */
1284     tag1 = ((PetscObject)C)->tag;
1285     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1286     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1287 
1288     /* evaluate communication - mesg to who, length of mesg, and buffer space
1289      required. Based on this, buffers are allocated, and data copied into them */
1290     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1291     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1292 
1293     /* w1[proc] = num of rows owned by proc -- to be requested */
1294     proc = 0;
1295     nrqs = 0; /* num of outgoing messages */
1296     for (j=0; j<nrow; j++) {
1297       row  = irow[j];
1298       if (!isrowsorted) proc = 0;
1299       while (row >= C->rmap->range[proc+1]) proc++;
1300       w1[proc]++;
1301       row2proc[j] = proc; /* map row index to proc */
1302 
1303       if (proc != rank && !w2[proc]) {
1304         w2[proc] = 1; nrqs++;
1305       }
1306     }
1307     w1[rank] = 0;  /* rows owned by self will not be requested */
1308 
1309     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1310     for (proc=0,j=0; proc<size; proc++) {
1311       if (w1[proc]) { pa[j++] = proc;}
1312     }
1313 
1314     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1315     msz = 0;              /* total mesg length (for all procs) */
1316     for (i=0; i<nrqs; i++) {
1317       proc      = pa[i];
1318       w1[proc] += 3;
1319       msz      += w1[proc];
1320     }
1321     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1322 
1323     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1324     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1325     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1326 
1327     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1328        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1329     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1330 
1331     /* Now post the Irecvs corresponding to these messages */
1332     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1333 
1334     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1335     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1336 
1337     /* Allocate Memory for outgoing messages */
1338     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1339     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1340     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1341 
1342     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1343     iptr = tmp;
1344     for (i=0; i<nrqs; i++) {
1345       proc        = pa[i];
1346       sbuf1[proc] = iptr;
1347       iptr       += w1[proc];
1348     }
1349 
1350     /* Form the outgoing messages */
1351     /* Initialize the header space */
1352     for (i=0; i<nrqs; i++) {
1353       proc      = pa[i];
1354       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1355       ptr[proc] = sbuf1[proc] + 3;
1356     }
1357 
1358     /* Parse the isrow and copy data into outbuf */
1359     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1360     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1361       proc = row2proc[j];
1362       if (proc != rank) { /* copy to the outgoing buf*/
1363         *ptr[proc] = irow[j];
1364         ctr[proc]++; ptr[proc]++;
1365       }
1366     }
1367 
1368     /* Update the headers for the current IS */
1369     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1370       if ((ctr_j = ctr[j])) {
1371         sbuf1_j        = sbuf1[j];
1372         k              = ++sbuf1_j[0];
1373         sbuf1_j[2*k]   = ctr_j;
1374         sbuf1_j[2*k-1] = 0;
1375       }
1376     }
1377 
1378     /* Now post the sends */
1379     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1380     for (i=0; i<nrqs; ++i) {
1381       proc = pa[i];
1382       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1383     }
1384 
1385     /* Post Receives to capture the buffer size */
1386     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1387     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1388 
1389     rbuf2[0] = tmp + msz;
1390     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1391 
1392     for (i=0; i<nrqs; ++i) {
1393       proc = pa[i];
1394       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1395     }
1396 
1397     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1398 
1399     /* Send to other procs the buf size they should allocate */
1400     /* Receive messages*/
1401     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1402     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1403 
1404     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1405     for (i=0; i<nrqr; ++i) {
1406       req_size[i] = 0;
1407       rbuf1_i        = rbuf1[i];
1408       start          = 2*rbuf1_i[0] + 1;
1409       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1410       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1411       sbuf2_i        = sbuf2[i];
1412       for (j=start; j<end; j++) {
1413         k            = rbuf1_i[j] - rstart;
1414         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1415         sbuf2_i[j]   = ncols;
1416         req_size[i] += ncols;
1417       }
1418       req_source1[i] = r_status1[i].MPI_SOURCE;
1419 
1420       /* form the header */
1421       sbuf2_i[0] = req_size[i];
1422       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1423 
1424       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1425     }
1426 
1427     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1428     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1429 
1430     /* rbuf2 is received, Post recv column indices a->j */
1431     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1432 
1433     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1434     for (i=0; i<nrqs; ++i) {
1435       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1436       req_source2[i] = r_status2[i].MPI_SOURCE;
1437       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1438     }
1439 
1440     /* Wait on sends1 and sends2 */
1441     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1442     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1443     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1444     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1445 
1446     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1447     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1448 
1449     /* Now allocate sending buffers for a->j, and send them off */
1450     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1451     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1452     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1453     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1454 
1455     for (i=0; i<nrqr; i++) { /* for each requested message */
1456       rbuf1_i   = rbuf1[i];
1457       sbuf_aj_i = sbuf_aj[i];
1458       ct1       = 2*rbuf1_i[0] + 1;
1459       ct2       = 0;
1460       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1461 
1462       kmax = rbuf1[i][2];
1463       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1464         row    = rbuf1_i[ct1] - rstart;
1465         nzA    = ai[row+1] - ai[row];
1466         nzB    = bi[row+1] - bi[row];
1467         ncols  = nzA + nzB;
1468         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1469 
1470         /* load the column indices for this row into cols*/
1471         cols = sbuf_aj_i + ct2;
1472 
1473         lwrite = 0;
1474         for (l=0; l<nzB; l++) {
1475           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1476         }
1477         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1478         for (l=0; l<nzB; l++) {
1479           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1480         }
1481 
1482         ct2 += ncols;
1483       }
1484       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1485     }
1486 
1487     /* create column map (cmap): global col of C -> local col of submat */
1488 #if defined(PETSC_USE_CTABLE)
1489     if (!allcolumns) {
1490       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1491       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1492       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1493         if (icol[j] >= cstart && icol[j] <cend) {
1494           cmap_loc[icol[j] - cstart] = j+1;
1495         } else { /* use PetscTable for non-local col indices */
1496           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1497         }
1498       }
1499     } else {
1500       cmap     = NULL;
1501       cmap_loc = NULL;
1502     }
1503     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1504 #else
1505     if (!allcolumns) {
1506       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1507       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1508     } else {
1509       cmap = NULL;
1510     }
1511 #endif
1512 
1513     /* Create lens for MatSeqAIJSetPreallocation() */
1514     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1515 
1516     /* Compute lens from local part of C */
1517     for (j=0; j<nrow; j++) {
1518       row  = irow[j];
1519       proc = row2proc[j];
1520       if (proc == rank) {
1521         /* diagonal part A = c->A */
1522         ncols = ai[row-rstart+1] - ai[row-rstart];
1523         cols  = aj + ai[row-rstart];
1524         if (!allcolumns) {
1525           for (k=0; k<ncols; k++) {
1526 #if defined(PETSC_USE_CTABLE)
1527             tcol = cmap_loc[cols[k]];
1528 #else
1529             tcol = cmap[cols[k]+cstart];
1530 #endif
1531             if (tcol) lens[j]++;
1532           }
1533         } else { /* allcolumns */
1534           lens[j] = ncols;
1535         }
1536 
1537         /* off-diagonal part B = c->B */
1538         ncols = bi[row-rstart+1] - bi[row-rstart];
1539         cols  = bj + bi[row-rstart];
1540         if (!allcolumns) {
1541           for (k=0; k<ncols; k++) {
1542 #if defined(PETSC_USE_CTABLE)
1543             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1544 #else
1545             tcol = cmap[bmap[cols[k]]];
1546 #endif
1547             if (tcol) lens[j]++;
1548           }
1549         } else { /* allcolumns */
1550           lens[j] += ncols;
1551         }
1552       }
1553     }
1554 
1555     /* Create row map (rmap): global row of C -> local row of submat */
1556 #if defined(PETSC_USE_CTABLE)
1557     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1558     for (j=0; j<nrow; j++) {
1559       row  = irow[j];
1560       proc = row2proc[j];
1561       if (proc == rank) { /* a local row */
1562         rmap_loc[row - rstart] = j;
1563       } else {
1564         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1565       }
1566     }
1567 #else
1568     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1569     for (j=0; j<nrow; j++) {
1570       rmap[irow[j]] = j;
1571     }
1572 #endif
1573 
1574     /* Update lens from offproc data */
1575     /* recv a->j is done */
1576     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1577     for (i=0; i<nrqs; i++) {
1578       proc    = pa[i];
1579       sbuf1_i = sbuf1[proc];
1580       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1581       ct1     = 2 + 1;
1582       ct2     = 0;
1583       rbuf2_i = rbuf2[i]; /* received length of C->j */
1584       rbuf3_i = rbuf3[i]; /* received C->j */
1585 
1586       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1587       max1   = sbuf1_i[2];
1588       for (k=0; k<max1; k++,ct1++) {
1589 #if defined(PETSC_USE_CTABLE)
1590         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1591         row--;
1592         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1593 #else
1594         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1595 #endif
1596         /* Now, store row index of submat in sbuf1_i[ct1] */
1597         sbuf1_i[ct1] = row;
1598 
1599         nnz = rbuf2_i[ct1];
1600         if (!allcolumns) {
1601           for (l=0; l<nnz; l++,ct2++) {
1602 #if defined(PETSC_USE_CTABLE)
1603             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1604               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1605             } else {
1606               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1607             }
1608 #else
1609             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1610 #endif
1611             if (tcol) lens[row]++;
1612           }
1613         } else { /* allcolumns */
1614           lens[row] += nnz;
1615         }
1616       }
1617     }
1618     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1619     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1620 
1621     /* Create the submatrices */
1622     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1623     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1624 
1625     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1626     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1627     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1628     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1629     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1630 
1631     /* create struct Mat_SubSppt and attached it to submat */
1632     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1633     subc = (Mat_SeqAIJ*)submat->data;
1634     subc->submatis1 = smatis1;
1635 
1636     smatis1->id          = 0;
1637     smatis1->nrqs        = nrqs;
1638     smatis1->nrqr        = nrqr;
1639     smatis1->rbuf1       = rbuf1;
1640     smatis1->rbuf2       = rbuf2;
1641     smatis1->rbuf3       = rbuf3;
1642     smatis1->sbuf2       = sbuf2;
1643     smatis1->req_source2 = req_source2;
1644 
1645     smatis1->sbuf1       = sbuf1;
1646     smatis1->ptr         = ptr;
1647     smatis1->tmp         = tmp;
1648     smatis1->ctr         = ctr;
1649 
1650     smatis1->pa           = pa;
1651     smatis1->req_size     = req_size;
1652     smatis1->req_source1  = req_source1;
1653 
1654     smatis1->allcolumns  = allcolumns;
1655     smatis1->singleis    = PETSC_TRUE;
1656     smatis1->row2proc    = row2proc;
1657     smatis1->rmap        = rmap;
1658     smatis1->cmap        = cmap;
1659 #if defined(PETSC_USE_CTABLE)
1660     smatis1->rmap_loc    = rmap_loc;
1661     smatis1->cmap_loc    = cmap_loc;
1662 #endif
1663 
1664     smatis1->destroy     = submat->ops->destroy;
1665     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1666     submat->factortype   = C->factortype;
1667 
1668     /* compute rmax */
1669     rmax = 0;
1670     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1671 
1672   } else { /* scall == MAT_REUSE_MATRIX */
1673     submat = submats[0];
1674     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1675 
1676     subc    = (Mat_SeqAIJ*)submat->data;
1677     rmax    = subc->rmax;
1678     smatis1 = subc->submatis1;
1679     nrqs        = smatis1->nrqs;
1680     nrqr        = smatis1->nrqr;
1681     rbuf1       = smatis1->rbuf1;
1682     rbuf2       = smatis1->rbuf2;
1683     rbuf3       = smatis1->rbuf3;
1684     req_source2 = smatis1->req_source2;
1685 
1686     sbuf1     = smatis1->sbuf1;
1687     sbuf2     = smatis1->sbuf2;
1688     ptr       = smatis1->ptr;
1689     tmp       = smatis1->tmp;
1690     ctr       = smatis1->ctr;
1691 
1692     pa         = smatis1->pa;
1693     req_size   = smatis1->req_size;
1694     req_source1 = smatis1->req_source1;
1695 
1696     allcolumns = smatis1->allcolumns;
1697     row2proc   = smatis1->row2proc;
1698     rmap       = smatis1->rmap;
1699     cmap       = smatis1->cmap;
1700 #if defined(PETSC_USE_CTABLE)
1701     rmap_loc   = smatis1->rmap_loc;
1702     cmap_loc   = smatis1->cmap_loc;
1703 #endif
1704   }
1705 
1706   /* Post recv matrix values */
1707   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1708   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1709   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1710   for (i=0; i<nrqs; ++i) {
1711     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1712     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1713   }
1714 
1715   /* Allocate sending buffers for a->a, and send them off */
1716   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1717   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1718   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1719   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1720 
1721   for (i=0; i<nrqr; i++) {
1722     rbuf1_i   = rbuf1[i];
1723     sbuf_aa_i = sbuf_aa[i];
1724     ct1       = 2*rbuf1_i[0]+1;
1725     ct2       = 0;
1726     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1727 
1728     kmax = rbuf1_i[2];
1729     for (k=0; k<kmax; k++,ct1++) {
1730       row = rbuf1_i[ct1] - rstart;
1731       nzA = ai[row+1] - ai[row];
1732       nzB = bi[row+1] - bi[row];
1733       ncols  = nzA + nzB;
1734       cworkB = bj + bi[row];
1735       vworkA = a_a + ai[row];
1736       vworkB = b_a + bi[row];
1737 
1738       /* load the column values for this row into vals*/
1739       vals = sbuf_aa_i + ct2;
1740 
1741       lwrite = 0;
1742       for (l=0; l<nzB; l++) {
1743         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1744       }
1745       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1746       for (l=0; l<nzB; l++) {
1747         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1748       }
1749 
1750       ct2 += ncols;
1751     }
1752     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1753   }
1754 
1755   /* Assemble submat */
1756   /* First assemble the local rows */
1757   for (j=0; j<nrow; j++) {
1758     row  = irow[j];
1759     proc = row2proc[j];
1760     if (proc == rank) {
1761       Crow = row - rstart;  /* local row index of C */
1762 #if defined(PETSC_USE_CTABLE)
1763       row = rmap_loc[Crow]; /* row index of submat */
1764 #else
1765       row = rmap[row];
1766 #endif
1767 
1768       if (allcolumns) {
1769         /* diagonal part A = c->A */
1770         ncols = ai[Crow+1] - ai[Crow];
1771         cols  = aj + ai[Crow];
1772         vals  = a->a + ai[Crow];
1773         i     = 0;
1774         for (k=0; k<ncols; k++) {
1775           subcols[i]   = cols[k] + cstart;
1776           subvals[i++] = vals[k];
1777         }
1778 
1779         /* off-diagonal part B = c->B */
1780         ncols = bi[Crow+1] - bi[Crow];
1781         cols  = bj + bi[Crow];
1782         vals  = b->a + bi[Crow];
1783         for (k=0; k<ncols; k++) {
1784           subcols[i]   = bmap[cols[k]];
1785           subvals[i++] = vals[k];
1786         }
1787 
1788         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1789 
1790       } else { /* !allcolumns */
1791 #if defined(PETSC_USE_CTABLE)
1792         /* diagonal part A = c->A */
1793         ncols = ai[Crow+1] - ai[Crow];
1794         cols  = aj + ai[Crow];
1795         vals  = a->a + ai[Crow];
1796         i     = 0;
1797         for (k=0; k<ncols; k++) {
1798           tcol = cmap_loc[cols[k]];
1799           if (tcol) {
1800             subcols[i]   = --tcol;
1801             subvals[i++] = vals[k];
1802           }
1803         }
1804 
1805         /* off-diagonal part B = c->B */
1806         ncols = bi[Crow+1] - bi[Crow];
1807         cols  = bj + bi[Crow];
1808         vals  = b->a + bi[Crow];
1809         for (k=0; k<ncols; k++) {
1810           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1811           if (tcol) {
1812             subcols[i]   = --tcol;
1813             subvals[i++] = vals[k];
1814           }
1815         }
1816 #else
1817         /* diagonal part A = c->A */
1818         ncols = ai[Crow+1] - ai[Crow];
1819         cols  = aj + ai[Crow];
1820         vals  = a->a + ai[Crow];
1821         i     = 0;
1822         for (k=0; k<ncols; k++) {
1823           tcol = cmap[cols[k]+cstart];
1824           if (tcol) {
1825             subcols[i]   = --tcol;
1826             subvals[i++] = vals[k];
1827           }
1828         }
1829 
1830         /* off-diagonal part B = c->B */
1831         ncols = bi[Crow+1] - bi[Crow];
1832         cols  = bj + bi[Crow];
1833         vals  = b->a + bi[Crow];
1834         for (k=0; k<ncols; k++) {
1835           tcol = cmap[bmap[cols[k]]];
1836           if (tcol) {
1837             subcols[i]   = --tcol;
1838             subvals[i++] = vals[k];
1839           }
1840         }
1841 #endif
1842         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1843       }
1844     }
1845   }
1846 
1847   /* Now assemble the off-proc rows */
1848   for (i=0; i<nrqs; i++) { /* for each requested message */
1849     /* recv values from other processes */
1850     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1851     proc    = pa[idex];
1852     sbuf1_i = sbuf1[proc];
1853     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1854     ct1     = 2 + 1;
1855     ct2     = 0; /* count of received C->j */
1856     ct3     = 0; /* count of received C->j that will be inserted into submat */
1857     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1858     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1859     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1860 
1861     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1862     max1 = sbuf1_i[2];             /* num of rows */
1863     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1864       row = sbuf1_i[ct1]; /* row index of submat */
1865       if (!allcolumns) {
1866         idex = 0;
1867         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1868           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1869           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1870 #if defined(PETSC_USE_CTABLE)
1871             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1872               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1873             } else {
1874               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1875             }
1876 #else
1877             tcol = cmap[rbuf3_i[ct2]];
1878 #endif
1879             if (tcol) {
1880               subcols[idex]   = --tcol; /* may not be sorted */
1881               subvals[idex++] = rbuf4_i[ct2];
1882 
1883               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1884                For reuse, we replace received C->j with index that should be inserted to submat */
1885               rbuf3_i[ct3++] = ct2;
1886             }
1887           }
1888           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1889         } else { /* scall == MAT_REUSE_MATRIX */
1890           submat = submats[0];
1891           subc   = (Mat_SeqAIJ*)submat->data;
1892 
1893           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1894           for (l=0; l<nnz; l++) {
1895             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1896             subvals[idex++] = rbuf4_i[ct2];
1897           }
1898 
1899           bj = subc->j + subc->i[row]; /* sorted column indices */
1900           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1901         }
1902       } else { /* allcolumns */
1903         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1904         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1905         ct2 += nnz;
1906       }
1907     }
1908   }
1909 
1910   /* sending a->a are done */
1911   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1912   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1913 
1914   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1915   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1916   submats[0] = submat;
1917 
1918   /* Restore the indices */
1919   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1920   if (!allcolumns) {
1921     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1922   }
1923 
1924   /* Destroy allocated memory */
1925   for (i=0; i<nrqs; ++i) {
1926     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1927   }
1928   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1929   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1930   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1931 
1932   if (scall == MAT_INITIAL_MATRIX) {
1933     ierr = PetscFree(lens);CHKERRQ(ierr);
1934     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1935     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1936   }
1937   PetscFunctionReturn(0);
1938 }
1939 
1940 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1941 {
1942   PetscErrorCode ierr;
1943   PetscInt       ncol;
1944   PetscBool      colflag,allcolumns=PETSC_FALSE;
1945 
1946   PetscFunctionBegin;
1947   /* Allocate memory to hold all the submatrices */
1948   if (scall == MAT_INITIAL_MATRIX) {
1949     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1950   }
1951 
1952   /* Check for special case: each processor gets entire matrix columns */
1953   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1954   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1955   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1956 
1957   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1958   PetscFunctionReturn(0);
1959 }
1960 
/*
   MatCreateSubMatrices_MPIAIJ - extracts ismax (sequential) submatrices of the
   parallel matrix C, one per (isrow[i], iscol[i]) pair.

   To bound memory use, the extraction is performed in stages of at most nmax
   index sets each; every process must execute the same number of stages since
   each stage involves collective communication.  Two fast paths exist:
   the "single IS" path (flag set by PCSetUp_ASM()) and the "want all matrix"
   path (every process requests the identity submatrix of a square C).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices per stage, sized so that roughly 20M bytes of
     PetscInt index data per submatrix column dimension are handled at once */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* wantallmatrix: a single identity row/column IS spanning all of a square C */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single allreduce combining both values: MPI_MAX of -wantallmatrix is 0
       unless EVERY process wants the whole matrix (all contribute -1); MPI_MAX
       of nstages makes every process loop through the global maximum. */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the communication bookkeeping (Mat_SubSppt) saved by the initial call */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  /* extra nstages slots hold per-stage dummy matrices for processes whose
     index sets are exhausted before the global stage count is reached */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    /* max_no = number of index sets handled in this stage (may be 0 for a
       process that has already processed all of its ismax sets) */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2058 
2059 /* -------------------------------------------------------------------------*/
2060 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2061 {
2062   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2063   Mat            A  = c->A;
2064   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2065   const PetscInt **icol,**irow;
2066   PetscInt       **iicol,**iirow;
2067   PetscInt       *nrow,*ncol,start;
2068   PetscErrorCode ierr;
2069   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2070   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2071   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2072   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2073   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2074 #if defined(PETSC_USE_CTABLE)
2075   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2076 #else
2077   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2078 #endif
2079   const PetscInt *irow_i;
2080   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2081   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2082   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2083   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2084   MPI_Status     *r_status3,*r_status4,*s_status4;
2085   MPI_Comm       comm;
2086   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2087   PetscMPIInt    *onodes1,*olengths1,end;
2088   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2089   Mat_SubSppt    *smat_i;
2090   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2091   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2092 
2093   PetscFunctionBegin;
2094   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2095   size = c->size;
2096   rank = c->rank;
2097 
2098   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2099   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2100 
2101   for (i=0; i<ismax; i++) {
2102     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2103     if (!issorted[i]) iscsorted = issorted[i];
2104 
2105     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2106 
2107     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2108     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2109 
2110     /* Check for special case: allcolumn */
2111     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2112     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2113     if (colflag && ncol[i] == C->cmap->N) {
2114       allcolumns[i] = PETSC_TRUE;
2115       icol[i] = NULL;
2116     } else {
2117       allcolumns[i] = PETSC_FALSE;
2118       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2119     }
2120   }
2121 
2122   if (scall == MAT_REUSE_MATRIX) {
2123     /* Assumes new rows are same length as the old rows */
2124     for (i=0; i<ismax; i++) {
2125       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2126       subc = (Mat_SeqAIJ*)submats[i]->data;
2127       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2128 
2129       /* Initial matrix as if empty */
2130       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2131 
2132       smat_i   = subc->submatis1;
2133 
2134       nrqs        = smat_i->nrqs;
2135       nrqr        = smat_i->nrqr;
2136       rbuf1       = smat_i->rbuf1;
2137       rbuf2       = smat_i->rbuf2;
2138       rbuf3       = smat_i->rbuf3;
2139       req_source2 = smat_i->req_source2;
2140 
2141       sbuf1     = smat_i->sbuf1;
2142       sbuf2     = smat_i->sbuf2;
2143       ptr       = smat_i->ptr;
2144       tmp       = smat_i->tmp;
2145       ctr       = smat_i->ctr;
2146 
2147       pa          = smat_i->pa;
2148       req_size    = smat_i->req_size;
2149       req_source1 = smat_i->req_source1;
2150 
2151       allcolumns[i] = smat_i->allcolumns;
2152       row2proc[i]   = smat_i->row2proc;
2153       rmap[i]       = smat_i->rmap;
2154       cmap[i]       = smat_i->cmap;
2155     }
2156 
2157     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2158       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2159       smat_i = (Mat_SubSppt*)submats[0]->data;
2160 
2161       nrqs        = smat_i->nrqs;
2162       nrqr        = smat_i->nrqr;
2163       rbuf1       = smat_i->rbuf1;
2164       rbuf2       = smat_i->rbuf2;
2165       rbuf3       = smat_i->rbuf3;
2166       req_source2 = smat_i->req_source2;
2167 
2168       sbuf1       = smat_i->sbuf1;
2169       sbuf2       = smat_i->sbuf2;
2170       ptr         = smat_i->ptr;
2171       tmp         = smat_i->tmp;
2172       ctr         = smat_i->ctr;
2173 
2174       pa          = smat_i->pa;
2175       req_size    = smat_i->req_size;
2176       req_source1 = smat_i->req_source1;
2177 
2178       allcolumns[0] = PETSC_FALSE;
2179     }
2180   } else { /* scall == MAT_INITIAL_MATRIX */
2181     /* Get some new tags to keep the communication clean */
2182     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2183     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2184 
2185     /* evaluate communication - mesg to who, length of mesg, and buffer space
2186      required. Based on this, buffers are allocated, and data copied into them*/
2187     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2188 
2189     for (i=0; i<ismax; i++) {
2190       jmax   = nrow[i];
2191       irow_i = irow[i];
2192 
2193       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2194       row2proc[i] = row2proc_i;
2195 
2196       if (issorted[i]) proc = 0;
2197       for (j=0; j<jmax; j++) {
2198         if (!issorted[i]) proc = 0;
2199         row = irow_i[j];
2200         while (row >= C->rmap->range[proc+1]) proc++;
2201         w4[proc]++;
2202         row2proc_i[j] = proc; /* map row index to proc */
2203       }
2204       for (j=0; j<size; j++) {
2205         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2206       }
2207     }
2208 
2209     nrqs     = 0;              /* no of outgoing messages */
2210     msz      = 0;              /* total mesg length (for all procs) */
2211     w1[rank] = 0;              /* no mesg sent to self */
2212     w3[rank] = 0;
2213     for (i=0; i<size; i++) {
2214       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2215     }
2216     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2217     for (i=0,j=0; i<size; i++) {
2218       if (w1[i]) { pa[j] = i; j++; }
2219     }
2220 
2221     /* Each message would have a header = 1 + 2*(no of IS) + data */
2222     for (i=0; i<nrqs; i++) {
2223       j      = pa[i];
2224       w1[j] += w2[j] + 2* w3[j];
2225       msz   += w1[j];
2226     }
2227     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2228 
2229     /* Determine the number of messages to expect, their lengths, from from-ids */
2230     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2231     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2232 
2233     /* Now post the Irecvs corresponding to these messages */
2234     tag0 = ((PetscObject)C)->tag;
2235     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2236 
2237     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2238     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2239 
2240     /* Allocate Memory for outgoing messages */
2241     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2242     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2243     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2244 
2245     {
2246       PetscInt *iptr = tmp;
2247       k    = 0;
2248       for (i=0; i<nrqs; i++) {
2249         j        = pa[i];
2250         iptr    += k;
2251         sbuf1[j] = iptr;
2252         k        = w1[j];
2253       }
2254     }
2255 
2256     /* Form the outgoing messages. Initialize the header space */
2257     for (i=0; i<nrqs; i++) {
2258       j           = pa[i];
2259       sbuf1[j][0] = 0;
2260       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2261       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2262     }
2263 
2264     /* Parse the isrow and copy data into outbuf */
2265     for (i=0; i<ismax; i++) {
2266       row2proc_i = row2proc[i];
2267       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2268       irow_i = irow[i];
2269       jmax   = nrow[i];
2270       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2271         proc = row2proc_i[j];
2272         if (proc != rank) { /* copy to the outgoing buf*/
2273           ctr[proc]++;
2274           *ptr[proc] = irow_i[j];
2275           ptr[proc]++;
2276         }
2277       }
2278       /* Update the headers for the current IS */
2279       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2280         if ((ctr_j = ctr[j])) {
2281           sbuf1_j        = sbuf1[j];
2282           k              = ++sbuf1_j[0];
2283           sbuf1_j[2*k]   = ctr_j;
2284           sbuf1_j[2*k-1] = i;
2285         }
2286       }
2287     }
2288 
2289     /*  Now  post the sends */
2290     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2291     for (i=0; i<nrqs; ++i) {
2292       j    = pa[i];
2293       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2294     }
2295 
2296     /* Post Receives to capture the buffer size */
2297     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2298     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2299     rbuf2[0] = tmp + msz;
2300     for (i=1; i<nrqs; ++i) {
2301       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2302     }
2303     for (i=0; i<nrqs; ++i) {
2304       j    = pa[i];
2305       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2306     }
2307 
2308     /* Send to other procs the buf size they should allocate */
2309     /* Receive messages*/
2310     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2311     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2312     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2313     {
2314       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2315       PetscInt   *sbuf2_i;
2316 
2317       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2318       for (i=0; i<nrqr; ++i) {
2319         req_size[i] = 0;
2320         rbuf1_i        = rbuf1[i];
2321         start          = 2*rbuf1_i[0] + 1;
2322         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2323         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2324         sbuf2_i        = sbuf2[i];
2325         for (j=start; j<end; j++) {
2326           id              = rbuf1_i[j] - rstart;
2327           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2328           sbuf2_i[j]      = ncols;
2329           req_size[i] += ncols;
2330         }
2331         req_source1[i] = r_status1[i].MPI_SOURCE;
2332         /* form the header */
2333         sbuf2_i[0] = req_size[i];
2334         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2335 
2336         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2337       }
2338     }
2339     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2340     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2341     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2342 
2343     /* Receive messages*/
2344     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2345     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2346 
2347     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2348     for (i=0; i<nrqs; ++i) {
2349       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2350       req_source2[i] = r_status2[i].MPI_SOURCE;
2351       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2352     }
2353     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2354     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2355 
2356     /* Wait on sends1 and sends2 */
2357     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2358     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2359 
2360     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2361     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2362     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2363     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2364     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2365     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2366 
2367     /* Now allocate sending buffers for a->j, and send them off */
2368     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2369     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2370     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2371     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2372 
2373     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2374     {
2375       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2376       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2377       PetscInt cend = C->cmap->rend;
2378       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2379 
2380       for (i=0; i<nrqr; i++) {
2381         rbuf1_i   = rbuf1[i];
2382         sbuf_aj_i = sbuf_aj[i];
2383         ct1       = 2*rbuf1_i[0] + 1;
2384         ct2       = 0;
2385         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2386           kmax = rbuf1[i][2*j];
2387           for (k=0; k<kmax; k++,ct1++) {
2388             row    = rbuf1_i[ct1] - rstart;
2389             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2390             ncols  = nzA + nzB;
2391             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2392 
2393             /* load the column indices for this row into cols */
2394             cols = sbuf_aj_i + ct2;
2395 
2396             lwrite = 0;
2397             for (l=0; l<nzB; l++) {
2398               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2399             }
2400             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2401             for (l=0; l<nzB; l++) {
2402               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2403             }
2404 
2405             ct2 += ncols;
2406           }
2407         }
2408         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2409       }
2410     }
2411     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2412 
2413     /* create col map: global col of C -> local col of submatrices */
2414     {
2415       const PetscInt *icol_i;
2416 #if defined(PETSC_USE_CTABLE)
2417       for (i=0; i<ismax; i++) {
2418         if (!allcolumns[i]) {
2419           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2420 
2421           jmax   = ncol[i];
2422           icol_i = icol[i];
2423           cmap_i = cmap[i];
2424           for (j=0; j<jmax; j++) {
2425             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2426           }
2427         } else cmap[i] = NULL;
2428       }
2429 #else
2430       for (i=0; i<ismax; i++) {
2431         if (!allcolumns[i]) {
2432           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2433           jmax   = ncol[i];
2434           icol_i = icol[i];
2435           cmap_i = cmap[i];
2436           for (j=0; j<jmax; j++) {
2437             cmap_i[icol_i[j]] = j+1;
2438           }
2439         } else cmap[i] = NULL;
2440       }
2441 #endif
2442     }
2443 
2444     /* Create lens which is required for MatCreate... */
2445     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2446     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2447 
2448     if (ismax) {
2449       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2450     }
2451     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2452 
2453     /* Update lens from local data */
2454     for (i=0; i<ismax; i++) {
2455       row2proc_i = row2proc[i];
2456       jmax = nrow[i];
2457       if (!allcolumns[i]) cmap_i = cmap[i];
2458       irow_i = irow[i];
2459       lens_i = lens[i];
2460       for (j=0; j<jmax; j++) {
2461         row = irow_i[j];
2462         proc = row2proc_i[j];
2463         if (proc == rank) {
2464           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2465           if (!allcolumns[i]) {
2466             for (k=0; k<ncols; k++) {
2467 #if defined(PETSC_USE_CTABLE)
2468               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2469 #else
2470               tcol = cmap_i[cols[k]];
2471 #endif
2472               if (tcol) lens_i[j]++;
2473             }
2474           } else { /* allcolumns */
2475             lens_i[j] = ncols;
2476           }
2477           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2478         }
2479       }
2480     }
2481 
2482     /* Create row map: global row of C -> local row of submatrices */
2483 #if defined(PETSC_USE_CTABLE)
2484     for (i=0; i<ismax; i++) {
2485       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2486       irow_i = irow[i];
2487       jmax   = nrow[i];
2488       for (j=0; j<jmax; j++) {
2489       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2490       }
2491     }
2492 #else
2493     for (i=0; i<ismax; i++) {
2494       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2495       rmap_i = rmap[i];
2496       irow_i = irow[i];
2497       jmax   = nrow[i];
2498       for (j=0; j<jmax; j++) {
2499         rmap_i[irow_i[j]] = j;
2500       }
2501     }
2502 #endif
2503 
2504     /* Update lens from offproc data */
2505     {
2506       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2507 
2508       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2509       for (tmp2=0; tmp2<nrqs; tmp2++) {
2510         sbuf1_i = sbuf1[pa[tmp2]];
2511         jmax    = sbuf1_i[0];
2512         ct1     = 2*jmax+1;
2513         ct2     = 0;
2514         rbuf2_i = rbuf2[tmp2];
2515         rbuf3_i = rbuf3[tmp2];
2516         for (j=1; j<=jmax; j++) {
2517           is_no  = sbuf1_i[2*j-1];
2518           max1   = sbuf1_i[2*j];
2519           lens_i = lens[is_no];
2520           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2521           rmap_i = rmap[is_no];
2522           for (k=0; k<max1; k++,ct1++) {
2523 #if defined(PETSC_USE_CTABLE)
2524             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2525             row--;
2526             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2527 #else
2528             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2529 #endif
2530             max2 = rbuf2_i[ct1];
2531             for (l=0; l<max2; l++,ct2++) {
2532               if (!allcolumns[is_no]) {
2533 #if defined(PETSC_USE_CTABLE)
2534                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2535 #else
2536                 tcol = cmap_i[rbuf3_i[ct2]];
2537 #endif
2538                 if (tcol) lens_i[row]++;
2539               } else { /* allcolumns */
2540                 lens_i[row]++; /* lens_i[row] += max2 ? */
2541               }
2542             }
2543           }
2544         }
2545       }
2546     }
2547     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2548     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2549     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2550     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2551 
2552     /* Create the submatrices */
2553     for (i=0; i<ismax; i++) {
2554       PetscInt    rbs,cbs;
2555 
2556       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2557       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2558 
2559       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2560       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2561 
2562       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2563       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2564       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2565 
2566       /* create struct Mat_SubSppt and attached it to submat */
2567       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2568       subc = (Mat_SeqAIJ*)submats[i]->data;
2569       subc->submatis1 = smat_i;
2570 
2571       smat_i->destroy          = submats[i]->ops->destroy;
2572       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2573       submats[i]->factortype   = C->factortype;
2574 
2575       smat_i->id          = i;
2576       smat_i->nrqs        = nrqs;
2577       smat_i->nrqr        = nrqr;
2578       smat_i->rbuf1       = rbuf1;
2579       smat_i->rbuf2       = rbuf2;
2580       smat_i->rbuf3       = rbuf3;
2581       smat_i->sbuf2       = sbuf2;
2582       smat_i->req_source2 = req_source2;
2583 
2584       smat_i->sbuf1       = sbuf1;
2585       smat_i->ptr         = ptr;
2586       smat_i->tmp         = tmp;
2587       smat_i->ctr         = ctr;
2588 
2589       smat_i->pa           = pa;
2590       smat_i->req_size     = req_size;
2591       smat_i->req_source1  = req_source1;
2592 
2593       smat_i->allcolumns  = allcolumns[i];
2594       smat_i->singleis    = PETSC_FALSE;
2595       smat_i->row2proc    = row2proc[i];
2596       smat_i->rmap        = rmap[i];
2597       smat_i->cmap        = cmap[i];
2598     }
2599 
2600     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2601       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2602       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2603       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2604 
2605       /* create struct Mat_SubSppt and attached it to submat */
2606       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2607       submats[0]->data = (void*)smat_i;
2608 
2609       smat_i->destroy          = submats[0]->ops->destroy;
2610       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2611       submats[0]->factortype   = C->factortype;
2612 
2613       smat_i->id          = 0;
2614       smat_i->nrqs        = nrqs;
2615       smat_i->nrqr        = nrqr;
2616       smat_i->rbuf1       = rbuf1;
2617       smat_i->rbuf2       = rbuf2;
2618       smat_i->rbuf3       = rbuf3;
2619       smat_i->sbuf2       = sbuf2;
2620       smat_i->req_source2 = req_source2;
2621 
2622       smat_i->sbuf1       = sbuf1;
2623       smat_i->ptr         = ptr;
2624       smat_i->tmp         = tmp;
2625       smat_i->ctr         = ctr;
2626 
2627       smat_i->pa           = pa;
2628       smat_i->req_size     = req_size;
2629       smat_i->req_source1  = req_source1;
2630 
2631       smat_i->allcolumns  = PETSC_FALSE;
2632       smat_i->singleis    = PETSC_FALSE;
2633       smat_i->row2proc    = NULL;
2634       smat_i->rmap        = NULL;
2635       smat_i->cmap        = NULL;
2636     }
2637 
2638     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2639     ierr = PetscFree(lens);CHKERRQ(ierr);
2640     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2641     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2642 
2643   } /* endof scall == MAT_INITIAL_MATRIX */
2644 
2645   /* Post recv matrix values */
2646   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2647   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2648   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2649   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2650   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2651   for (i=0; i<nrqs; ++i) {
2652     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2653     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2654   }
2655 
2656   /* Allocate sending buffers for a->a, and send them off */
2657   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2658   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2659   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2660   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2661 
2662   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2663   {
2664     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2665     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2666     PetscInt    cend   = C->cmap->rend;
2667     PetscInt    *b_j   = b->j;
2668     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2669 
2670     for (i=0; i<nrqr; i++) {
2671       rbuf1_i   = rbuf1[i];
2672       sbuf_aa_i = sbuf_aa[i];
2673       ct1       = 2*rbuf1_i[0]+1;
2674       ct2       = 0;
2675       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2676         kmax = rbuf1_i[2*j];
2677         for (k=0; k<kmax; k++,ct1++) {
2678           row    = rbuf1_i[ct1] - rstart;
2679           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2680           ncols  = nzA + nzB;
2681           cworkB = b_j + b_i[row];
2682           vworkA = a_a + a_i[row];
2683           vworkB = b_a + b_i[row];
2684 
2685           /* load the column values for this row into vals*/
2686           vals = sbuf_aa_i+ct2;
2687 
2688           lwrite = 0;
2689           for (l=0; l<nzB; l++) {
2690             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2691           }
2692           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2693           for (l=0; l<nzB; l++) {
2694             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2695           }
2696 
2697           ct2 += ncols;
2698         }
2699       }
2700       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2701     }
2702   }
2703 
2704   /* Assemble the matrices */
2705   /* First assemble the local rows */
2706   for (i=0; i<ismax; i++) {
2707     row2proc_i = row2proc[i];
2708     subc      = (Mat_SeqAIJ*)submats[i]->data;
2709     imat_ilen = subc->ilen;
2710     imat_j    = subc->j;
2711     imat_i    = subc->i;
2712     imat_a    = subc->a;
2713 
2714     if (!allcolumns[i]) cmap_i = cmap[i];
2715     rmap_i = rmap[i];
2716     irow_i = irow[i];
2717     jmax   = nrow[i];
2718     for (j=0; j<jmax; j++) {
2719       row  = irow_i[j];
2720       proc = row2proc_i[j];
2721       if (proc == rank) {
2722         old_row = row;
2723 #if defined(PETSC_USE_CTABLE)
2724         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2725         row--;
2726 #else
2727         row = rmap_i[row];
2728 #endif
2729         ilen_row = imat_ilen[row];
2730         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2731         mat_i    = imat_i[row];
2732         mat_a    = imat_a + mat_i;
2733         mat_j    = imat_j + mat_i;
2734         if (!allcolumns[i]) {
2735           for (k=0; k<ncols; k++) {
2736 #if defined(PETSC_USE_CTABLE)
2737             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2738 #else
2739             tcol = cmap_i[cols[k]];
2740 #endif
2741             if (tcol) {
2742               *mat_j++ = tcol - 1;
2743               *mat_a++ = vals[k];
2744               ilen_row++;
2745             }
2746           }
2747         } else { /* allcolumns */
2748           for (k=0; k<ncols; k++) {
2749             *mat_j++ = cols[k];  /* global col index! */
2750             *mat_a++ = vals[k];
2751             ilen_row++;
2752           }
2753         }
2754         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2755 
2756         imat_ilen[row] = ilen_row;
2757       }
2758     }
2759   }
2760 
2761   /* Now assemble the off proc rows */
2762   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2763   for (tmp2=0; tmp2<nrqs; tmp2++) {
2764     sbuf1_i = sbuf1[pa[tmp2]];
2765     jmax    = sbuf1_i[0];
2766     ct1     = 2*jmax + 1;
2767     ct2     = 0;
2768     rbuf2_i = rbuf2[tmp2];
2769     rbuf3_i = rbuf3[tmp2];
2770     rbuf4_i = rbuf4[tmp2];
2771     for (j=1; j<=jmax; j++) {
2772       is_no     = sbuf1_i[2*j-1];
2773       rmap_i    = rmap[is_no];
2774       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2775       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2776       imat_ilen = subc->ilen;
2777       imat_j    = subc->j;
2778       imat_i    = subc->i;
2779       imat_a    = subc->a;
2780       max1      = sbuf1_i[2*j];
2781       for (k=0; k<max1; k++,ct1++) {
2782         row = sbuf1_i[ct1];
2783 #if defined(PETSC_USE_CTABLE)
2784         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2785         row--;
2786 #else
2787         row = rmap_i[row];
2788 #endif
2789         ilen  = imat_ilen[row];
2790         mat_i = imat_i[row];
2791         mat_a = imat_a + mat_i;
2792         mat_j = imat_j + mat_i;
2793         max2  = rbuf2_i[ct1];
2794         if (!allcolumns[is_no]) {
2795           for (l=0; l<max2; l++,ct2++) {
2796 #if defined(PETSC_USE_CTABLE)
2797             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2798 #else
2799             tcol = cmap_i[rbuf3_i[ct2]];
2800 #endif
2801             if (tcol) {
2802               *mat_j++ = tcol - 1;
2803               *mat_a++ = rbuf4_i[ct2];
2804               ilen++;
2805             }
2806           }
2807         } else { /* allcolumns */
2808           for (l=0; l<max2; l++,ct2++) {
2809             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2810             *mat_a++ = rbuf4_i[ct2];
2811             ilen++;
2812           }
2813         }
2814         imat_ilen[row] = ilen;
2815       }
2816     }
2817   }
2818 
2819   if (!iscsorted) { /* sort column indices of the rows */
2820     for (i=0; i<ismax; i++) {
2821       subc      = (Mat_SeqAIJ*)submats[i]->data;
2822       imat_j    = subc->j;
2823       imat_i    = subc->i;
2824       imat_a    = subc->a;
2825       imat_ilen = subc->ilen;
2826 
2827       if (allcolumns[i]) continue;
2828       jmax = nrow[i];
2829       for (j=0; j<jmax; j++) {
2830         mat_i = imat_i[j];
2831         mat_a = imat_a + mat_i;
2832         mat_j = imat_j + mat_i;
2833         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2834       }
2835     }
2836   }
2837 
2838   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2839   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2840   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2841   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2842   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2843 
2844   /* Restore the indices */
2845   for (i=0; i<ismax; i++) {
2846     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2847     if (!allcolumns[i]) {
2848       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2849     }
2850   }
2851 
2852   for (i=0; i<ismax; i++) {
2853     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2854     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2855   }
2856 
2857   /* Destroy allocated memory */
2858   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2859   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2860   iirow = (PetscInt**) irow;  /* required because PetscFreeN() cannot work on const */
2861   iicol = (PetscInt**) icol;
2862   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2863 
2864   for (i=0; i<nrqs; ++i) {
2865     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2866   }
2867   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2868 
2869   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2870   PetscFunctionReturn(0);
2871 }
2872 
2873 /*
2874  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2875  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2876  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2877  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2878  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2879  state, and needs to be "assembled" later by compressing B's column space.
2880 
2881  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2882  Following this call, C->A & C->B have been created, even if empty.
2883  */
2884 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2885 {
2886   /* If making this function public, change the error returned in this function away from _PLIB. */
2887   PetscErrorCode ierr;
2888   Mat_MPIAIJ     *aij;
2889   Mat_SeqAIJ     *Baij;
2890   PetscBool      seqaij,Bdisassembled;
2891   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2892   PetscScalar    v;
2893   const PetscInt *rowindices,*colindices;
2894 
2895   PetscFunctionBegin;
2896   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2897   if (A) {
2898     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2899     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2900     if (rowemb) {
2901       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2902       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2903     } else {
2904       if (C->rmap->n != A->rmap->n) {
2905 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2906       }
2907     }
2908     if (dcolemb) {
2909       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2910       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2911     } else {
2912       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2913     }
2914   }
2915   if (B) {
2916     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2917     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2918     if (rowemb) {
2919       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2920       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2921     } else {
2922       if (C->rmap->n != B->rmap->n) {
2923 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2924       }
2925     }
2926     if (ocolemb) {
2927       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2928       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2929     } else {
2930       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2931     }
2932   }
2933 
2934   aij = (Mat_MPIAIJ*)C->data;
2935   if (!aij->A) {
2936     /* Mimic parts of MatMPIAIJSetPreallocation() */
2937     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2938     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2939     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2940     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2941     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2942   }
2943   if (A) {
2944     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2945   } else {
2946     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2947   }
2948   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2949     /*
2950       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2951       need to "disassemble" B -- convert it to using C's global indices.
2952       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2953 
2954       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2955       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2956 
2957       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2958       At least avoid calling MatSetValues() and the implied searches?
2959     */
2960 
2961     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2962 #if defined(PETSC_USE_CTABLE)
2963       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2964 #else
2965       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2966       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2967       if (aij->B) {
2968         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2969       }
2970 #endif
2971       ngcol = 0;
2972       if (aij->lvec) {
2973 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2974       }
2975       if (aij->garray) {
2976 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2977 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2978       }
2979       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2980       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2981     }
2982     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2983       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2984     }
2985     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2986       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2987     }
2988   }
2989   Bdisassembled = PETSC_FALSE;
2990   if (!aij->B) {
2991     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2992     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2993     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2994     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2995     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2996     Bdisassembled = PETSC_TRUE;
2997   }
2998   if (B) {
2999     Baij = (Mat_SeqAIJ*)B->data;
3000     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3001       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3002       for (i=0; i<B->rmap->n; i++) {
3003 	nz[i] = Baij->i[i+1] - Baij->i[i];
3004       }
3005       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3006       ierr = PetscFree(nz);CHKERRQ(ierr);
3007     }
3008 
3009     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3010     shift = rend-rstart;
3011     count = 0;
3012     rowindices = NULL;
3013     colindices = NULL;
3014     if (rowemb) {
3015       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3016     }
3017     if (ocolemb) {
3018       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3019     }
3020     for (i=0; i<B->rmap->n; i++) {
3021       PetscInt row;
3022       row = i;
3023       if (rowindices) row = rowindices[i];
3024       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3025 	col  = Baij->j[count];
3026 	if (colindices) col = colindices[col];
3027 	if (Bdisassembled && col>=rstart) col += shift;
3028 	v    = Baij->a[count];
3029 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3030 	++count;
3031       }
3032     }
3033     /* No assembly for aij->B is necessary. */
3034     /* FIXME: set aij->B's nonzerostate correctly. */
3035   } else {
3036     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3037   }
3038   C->preallocated  = PETSC_TRUE;
3039   C->was_assembled = PETSC_FALSE;
3040   C->assembled     = PETSC_FALSE;
3041    /*
3042       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3043       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3044    */
3045   PetscFunctionReturn(0);
3046 }
3047 
3048 /*
3049   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3050  */
3051 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3052 {
3053   Mat_MPIAIJ *mpiaij;
3054 
3055   PetscFunctionBegin;
3056   PetscValidPointer(A,2);
3057   PetscValidPointer(B,3);
3058   /* FIXME: make sure C is assembled */
3059   mpiaij = (Mat_MPIAIJ*)C->data;
3060   /* Hand back the diagonal and off-diagonal sequential blocks as-is. */
3061   *A     = mpiaij->A;
3062   *B     = mpiaij->B;
3063   /* Note that we don't incref *A and *B, so be careful! */
3064   PetscFunctionReturn(0);
3065 }
3064 
3065 /*
3066   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3067   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3068 */
3069 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3070                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3071 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3072 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3073 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3074 {
3075   PetscErrorCode ierr;
3076   PetscMPIInt    isize,flag;
3077   PetscInt       i,ii,cismax,ispar;
3078   Mat            *A,*B;
3079   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3080 
3081   PetscFunctionBegin;
3082   if (!ismax) PetscFunctionReturn(0);
3083 
3084   for (i = 0, cismax = 0; i < ismax; ++i) {
3085     PetscMPIInt isize;
3086     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3087     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3088     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3089     if (isize > 1) ++cismax;
3090   }
3091 
3092   /*
3093      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3094      ispar counts the number of parallel ISs across C's comm.
3095   */
3096   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3097   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3098     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3099     PetscFunctionReturn(0);
3100   }
3101 
3102   /* if (ispar) */
3103   /*
3104     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3105     These are used to extract the off-diag portion of the resulting parallel matrix.
3106     The row IS for the off-diag portion is the same as for the diag portion,
3107     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3108   */
3109   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3110   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3111   for (i = 0, ii = 0; i < ismax; ++i) {
3112     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3113     if (isize > 1) {
3114       /*
3115 	 TODO: This is the part that's ***NOT SCALABLE***.
3116 	 To fix this we need to extract just the indices of C's nonzero columns
3117 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3118 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3119 	 to be done without serializing on the IS list, so, most likely, it is best
3120 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3121       */
3122       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3123       /* Now we have to
3124 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3125 	     were sorted on each rank, concatenated they might no longer be sorted;
3126 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3127 	     indices in the nondecreasing order to the original index positions.
3128 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3129       */
3130       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3131       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3132       ++ii;
3133     }
3134   }
3135   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3136   for (i = 0, ii = 0; i < ismax; ++i) {
3137     PetscInt       j,issize;
3138     const PetscInt *indices;
3139 
3140     /*
3141        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3142      */
3143     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3144     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3145     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3146     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3147     for (j = 1; j < issize; ++j) {
3148       if (indices[j] == indices[j-1]) {
3149 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3150       }
3151     }
3152     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3153 
3154 
3155     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3156     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3157     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3158     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3159     for (j = 1; j < issize; ++j) {
3160       if (indices[j-1] == indices[j]) {
3161 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3162       }
3163     }
3164     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3165     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3166     if (isize > 1) {
3167       cisrow[ii] = isrow[i];
3168       ++ii;
3169     }
3170   }
3171   /*
3172     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3173     array of sequential matrices underlying the resulting parallel matrices.
3174     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3175     contain duplicates.
3176 
3177     There are as many diag matrices as there are original index sets. There are only as many parallel
3178     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3179 
3180     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3181     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3182       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3183       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3184       with A[i] and B[ii] extracted from the corresponding MPI submat.
3185     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3186       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3187       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3188       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3189       values into A[i] and B[ii] sitting inside the corresponding submat.
3190     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3191       A[i], B[ii], AA[i] or BB[ii] matrices.
3192   */
3193   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3194   if (scall == MAT_INITIAL_MATRIX) {
3195     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3196   }
3197 
3198   /* Now obtain the sequential A and B submatrices separately. */
3199   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3200   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3201   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3202 
3203   /*
3204     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3205     matrices A & B have been extracted directly into the parallel matrices containing them, or
3206     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3207     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3208     to have the same sparsity pattern.
3209     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3210     must be constructed for C. This is done by setseqmat(s).
3211   */
3212   for (i = 0, ii = 0; i < ismax; ++i) {
3213     /*
3214        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3215        That way we can avoid sorting and computing permutations when reusing.
3216        To this end:
3217         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3218 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3219 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3220     */
3221     MatStructure pattern;
3222     pattern = DIFFERENT_NONZERO_PATTERN;
3223 
3224     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3225     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3226     if (isize > 1) {
3227       if (scall == MAT_INITIAL_MATRIX) {
3228 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3229 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3230 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3231 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3232 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3233       }
3234       /*
3235 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3236       */
3237       {
3238 	Mat AA,BB;
3239         AA = A[i];
3240         BB = B[ii];
3241 	if (AA || BB) {
3242 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3243 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3244 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3245 	}
3246 
3247         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3248       }
3249       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3250       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3251       ++ii;
3252     } else { /* if (isize == 1) */
3253       if (scall == MAT_REUSE_MATRIX) {
3254         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3255       }
3256       if (isrow_p[i] || iscol_p[i]) {
3257         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3258         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3259 	/* Otherwise A is extracted straight into (*submats)[i]. */
3260 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3261 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3262       } else (*submat)[i] = A[i];
3263     }
3264     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3265     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3266   }
3267   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3268   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3269   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3270   ierr = PetscFree(A);CHKERRQ(ierr);
3271   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3272   PetscFunctionReturn(0);
3273 }
3274 
3275 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3276 {
3277   PetscErrorCode ierr;
3278 
3279   PetscFunctionBegin;
3280   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3281   PetscFunctionReturn(0);
3282 }
3283