xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 4863603af06b16768e4e4a060cedf7d42fa55e81)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - performs ONE level of overlap increase
  for the index sets is[0..nidx-1].

  Algorithm outline:
    1. Gather all indices from the IS objects and classify each row as local or
       remote (owned by another rank) using the matrix row layout.
    2. If no rank has remote rows (reduced max is zero, typical for the first
       overlap level), only local expansion is needed.
    3. Otherwise, build a two-sided communication pattern (PetscCommBuildTwoSided),
       ship <is id, row> pairs to the owning ranks through a PetscSF, let the owners
       expand the rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable), and receive the
       expansions back while overlapping local expansion with the communication.
    4. Merge the received column sets into the index sets
       (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record rank, originating IS, and row in parallel arrays */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row contributes an <is id,row> pair */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  /* prefix-sum offsets into the packed send buffer, one segment per destination */
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second handshake: the reply sizes are exchanged; toranks/tosizes are fresh
   * allocations produced here (the earlier ones were freed above) */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received <is id, count, columns> messages back into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   MPI_Comm         *iscomms;
198   PetscErrorCode    ierr;
199 
200   PetscFunctionBegin;
201   max_lsize = 0;
202   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
203   for (i=0; i<nidx; i++){
204     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
205     max_lsize = lsize>max_lsize ? lsize:max_lsize;
206     isz[i]    = lsize;
207   }
208   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
209   for (i=0; i<nidx; i++){
210     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
211     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
213     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
214     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
215   }
216   /* retrieve information to get row id and its overlap */
217   for (i=0; i<nrecvs; ){
218     is_id     = recvdata[i++];
219     data_size = recvdata[i++];
220     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
221     isz_i     = isz[is_id];
222     for (j=0; j< data_size; j++){
223       col = recvdata[i++];
224       indices_i[isz_i++] = col;
225     }
226     isz[is_id] = isz_i;
227   }
228   /* remove duplicate entities */
229   for (i=0; i<nidx; i++){
230     indices_i = indices_temp+(max_lsize+nrecvs)*i;
231     isz_i     = isz[i];
232     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
233     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
234     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
235   }
236   ierr = PetscFree(isz);CHKERRQ(ierr);
237   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - expands the rows requested by other
  ranks into their sets of nonzero columns and packs the replies.

  Input:  fromrows holds <is id, row> pairs from nfrom ranks; fromsizes[2*i] is the
          number of PetscInt received from fromranks[i].
  Output: *sbrows is a packed buffer of <is id, count, columns...> triples and
          (*sbrowsizes)[2*i] / (*sbrowsizes)[2*i+1] are the length/offset of the
          segment to be sent back to fromranks[i].  Both arrays are allocated here;
          the caller frees them.

  The work is done in two identical passes over the incoming data: the first pass
  only counts the expanded sizes (so the reply buffer can be allocated exactly
  once), the second pass fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* PASS 1: compute indv_counts[i*nidx+j] = number of distinct columns the
   * expansion produces for rank i / index set j, so the reply buffer can be
   * sized exactly */
  for (i=0; i<nfrom; i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together: bucket the incoming rows by their is id */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* convert to local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat: diagonal-block columns, shifted to global numbering */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat: off-diagonal-block columns, mapped via gcols */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* PASS 2: repeat the expansion, this time writing the triples into sbdata */
  for (i=0;i<nfrom;i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue; /* nothing to send back for this IS */
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the per-rank reply lengths into offsets (sbsizes[2*i+1]) */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp,owner;
368   PetscMPIInt       rank;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
383   /* is it a safe way to compute number of nonzero values ? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* it is a better way to estimate memory than the old implementation
389    * where global size of matrix is used
390    * */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 
432 /*
433   Sample message format:
434   If a processor A wants processor B to process some elements corresponding
435   to index sets is[1],is[5]
436   mesg [0] = 2   (no of index sets in the mesg)
437   -----------
438   mesg [1] = 1 => is[1]
439   mesg [2] = sizeof(is[1]);
440   -----------
441   mesg [3] = 5  => is[5]
442   mesg [4] = sizeof(is[5]);
443   -----------
444   mesg [5]
  mesg [n]  data(is[1])
446   -----------
447   mesg[n+1]
448   mesg[m]  data(is[5])
449   -----------
450 
451   Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
454 */
455 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
456 {
457   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
458   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
459   const PetscInt **idx,*idx_i;
460   PetscInt       *n,**data,len;
461 #if defined(PETSC_USE_CTABLE)
462   PetscTable     *table_data,table_data_i;
463   PetscInt       *tdata,tcount,tcount_max;
464 #else
465   PetscInt       *data_i,*d_p;
466 #endif
467   PetscErrorCode ierr;
468   PetscMPIInt    size,rank,tag1,tag2;
469   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
470   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
471   PetscBT        *table;
472   MPI_Comm       comm;
473   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
474   MPI_Status     *s_status,*recv_status;
475   MPI_Comm       *iscomms;
476   char           *t_p;
477 
478   PetscFunctionBegin;
479   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
480   size = c->size;
481   rank = c->rank;
482   M    = C->rmap->N;
483 
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
485   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
486 
487   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
488 
489   for (i=0; i<imax; i++) {
490     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
491     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
492   }
493 
494   /* evaluate communication - mesg to who,length of mesg, and buffer space
495      required. Based on this, buffers are allocated, and data copied into them  */
496   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
497   for (i=0; i<imax; i++) {
498     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
499     idx_i = idx[i];
500     len   = n[i];
501     for (j=0; j<len; j++) {
502       row = idx_i[j];
503       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
504       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
505       w4[proc]++;
506     }
507     for (j=0; j<size; j++) {
508       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
509     }
510   }
511 
512   nrqs     = 0;              /* no of outgoing messages */
513   msz      = 0;              /* total mesg length (for all proc */
514   w1[rank] = 0;              /* no mesg sent to intself */
515   w3[rank] = 0;
516   for (i=0; i<size; i++) {
517     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
518   }
519   /* pa - is list of processors to communicate with */
520   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
521   for (i=0,j=0; i<size; i++) {
522     if (w1[i]) {pa[j] = i; j++;}
523   }
524 
525   /* Each message would have a header = 1 + 2*(no of IS) + data */
526   for (i=0; i<nrqs; i++) {
527     j      = pa[i];
528     w1[j] += w2[j] + 2*w3[j];
529     msz   += w1[j];
530   }
531 
532   /* Determine the number of messages to expect, their lengths, from from-ids */
533   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
534   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
535 
536   /* Now post the Irecvs corresponding to these messages */
537   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
538 
539   /* Allocate Memory for outgoing messages */
540   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
541   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
542   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
543 
544   {
545     PetscInt *iptr = tmp,ict  = 0;
546     for (i=0; i<nrqs; i++) {
547       j         = pa[i];
548       iptr     +=  ict;
549       outdat[j] = iptr;
550       ict       = w1[j];
551     }
552   }
553 
554   /* Form the outgoing messages */
555   /* plug in the headers */
556   for (i=0; i<nrqs; i++) {
557     j            = pa[i];
558     outdat[j][0] = 0;
559     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
560     ptr[j]       = outdat[j] + 2*w3[j] + 1;
561   }
562 
563   /* Memory for doing local proc's work */
564   {
565     PetscInt M_BPB_imax = 0;
566 #if defined(PETSC_USE_CTABLE)
567     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
568     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
569     for (i=0; i<imax; i++) {
570       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
571     }
572     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
573     for (i=0; i<imax; i++) {
574       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
575     }
576 #else
577     PetscInt Mimax = 0;
578     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
579     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
580     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
581     for (i=0; i<imax; i++) {
582       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
583       data[i]  = d_p + M*i;
584     }
585 #endif
586   }
587 
588   /* Parse the IS and update local tables and the outgoing buf with the data */
589   {
590     PetscInt n_i,isz_i,*outdat_j,ctr_j;
591     PetscBT  table_i;
592 
593     for (i=0; i<imax; i++) {
594       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
595       n_i     = n[i];
596       table_i = table[i];
597       idx_i   = idx[i];
598 #if defined(PETSC_USE_CTABLE)
599       table_data_i = table_data[i];
600 #else
601       data_i  = data[i];
602 #endif
603       isz_i   = isz[i];
604       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
605         row  = idx_i[j];
606         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
607         if (proc != rank) { /* copy to the outgoing buffer */
608           ctr[proc]++;
609           *ptr[proc] = row;
610           ptr[proc]++;
611         } else if (!PetscBTLookupSet(table_i,row)) {
612 #if defined(PETSC_USE_CTABLE)
613           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
614 #else
615           data_i[isz_i] = row; /* Update the local table */
616 #endif
617           isz_i++;
618         }
619       }
620       /* Update the headers for the current IS */
621       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
622         if ((ctr_j = ctr[j])) {
623           outdat_j        = outdat[j];
624           k               = ++outdat_j[0];
625           outdat_j[2*k]   = ctr_j;
626           outdat_j[2*k-1] = i;
627         }
628       }
629       isz[i] = isz_i;
630     }
631   }
632 
633   /*  Now  post the sends */
634   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
635   for (i=0; i<nrqs; ++i) {
636     j    = pa[i];
637     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
638   }
639 
640   /* No longer need the original indices */
641   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
642   for (i=0; i<imax; ++i) {
643     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
644     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
645   }
646   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
647 
648   for (i=0; i<imax; ++i) {
649     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
650   }
651 
652   /* Do Local work */
653 #if defined(PETSC_USE_CTABLE)
654   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
655 #else
656   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
657 #endif
658 
659   /* Receive messages */
660   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
661   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
662 
663   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
664   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
665 
666   /* Phase 1 sends are complete - deallocate buffers */
667   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
668   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
669 
670   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
671   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
672   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
673   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
674   ierr = PetscFree(rbuf);CHKERRQ(ierr);
675 
676 
677   /* Send the data back */
678   /* Do a global reduction to know the buffer space req for incoming messages */
679   {
680     PetscMPIInt *rw1;
681 
682     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
683 
684     for (i=0; i<nrqr; ++i) {
685       proc = recv_status[i].MPI_SOURCE;
686 
687       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
688       rw1[proc] = isz1[i];
689     }
690     ierr = PetscFree(onodes1);CHKERRQ(ierr);
691     ierr = PetscFree(olengths1);CHKERRQ(ierr);
692 
693     /* Determine the number of messages to expect, their lengths, from from-ids */
694     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
695     ierr = PetscFree(rw1);CHKERRQ(ierr);
696   }
697   /* Now post the Irecvs corresponding to these messages */
698   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
699 
700   /* Now  post the sends */
701   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
702   for (i=0; i<nrqr; ++i) {
703     j    = recv_status[i].MPI_SOURCE;
704     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
705   }
706 
707   /* receive work done on other processors */
708   {
709     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
710     PetscMPIInt idex;
711     PetscBT     table_i;
712     MPI_Status  *status2;
713 
714     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
715     for (i=0; i<nrqs; ++i) {
716       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
717       /* Process the message */
718       rbuf2_i = rbuf2[idex];
719       ct1     = 2*rbuf2_i[0]+1;
720       jmax    = rbuf2[idex][0];
721       for (j=1; j<=jmax; j++) {
722         max     = rbuf2_i[2*j];
723         is_no   = rbuf2_i[2*j-1];
724         isz_i   = isz[is_no];
725         table_i = table[is_no];
726 #if defined(PETSC_USE_CTABLE)
727         table_data_i = table_data[is_no];
728 #else
729         data_i  = data[is_no];
730 #endif
731         for (k=0; k<max; k++,ct1++) {
732           row = rbuf2_i[ct1];
733           if (!PetscBTLookupSet(table_i,row)) {
734 #if defined(PETSC_USE_CTABLE)
735             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
736 #else
737             data_i[isz_i] = row;
738 #endif
739             isz_i++;
740           }
741         }
742         isz[is_no] = isz_i;
743       }
744     }
745 
746     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
747     ierr = PetscFree(status2);CHKERRQ(ierr);
748   }
749 
750 #if defined(PETSC_USE_CTABLE)
751   tcount_max = 0;
752   for (i=0; i<imax; ++i) {
753     table_data_i = table_data[i];
754     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
755     if (tcount_max < tcount) tcount_max = tcount;
756   }
757   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
758 #endif
759 
760   for (i=0; i<imax; ++i) {
761 #if defined(PETSC_USE_CTABLE)
762     PetscTablePosition tpos;
763     table_data_i = table_data[i];
764 
765     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
766     while (tpos) {
767       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
768       tdata[--j] = --k;
769     }
770     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
771 #else
772     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
773 #endif
774     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
775   }
776 
777   ierr = PetscFree(iscomms);CHKERRQ(ierr);
778   ierr = PetscFree(onodes2);CHKERRQ(ierr);
779   ierr = PetscFree(olengths2);CHKERRQ(ierr);
780 
781   ierr = PetscFree(pa);CHKERRQ(ierr);
782   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
783   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
784   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
785   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
786   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
787   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
788   ierr = PetscFree(s_status);CHKERRQ(ierr);
789   ierr = PetscFree(recv_status);CHKERRQ(ierr);
790   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
791   ierr = PetscFree(xdata);CHKERRQ(ierr);
792   ierr = PetscFree(isz1);CHKERRQ(ierr);
793 #if defined(PETSC_USE_CTABLE)
794   for (i=0; i<imax; i++) {
795     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
796   }
797   ierr = PetscFree(table_data);CHKERRQ(ierr);
798   ierr = PetscFree(tdata);CHKERRQ(ierr);
799   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
800 #else
801   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
802 #endif
803   PetscFunctionReturn(0);
804 }
805 
806 /*
807    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
808        the work on the local processor.
809 
810      Inputs:
811       C      - MAT_MPIAIJ;
812       imax - total no of index sets processed at a time;
813       table  - an array of char - size = m bits.
814 
815      Output:
816       isz    - array containing the count of the solution elements corresponding
817                to each index set;
818       data or table_data  - pointer to the solutions
819 */
820 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
821 {
822   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
823   Mat        A  = c->A,B = c->B;
824   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
825   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
826   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
827   PetscBT    table_i;
828 #if defined(PETSC_USE_CTABLE)
829   PetscTable         table_data_i;
830   PetscErrorCode     ierr;
831   PetscTablePosition tpos;
832   PetscInt           tcount,*tdata;
833 #else
834   PetscInt           *data_i;
835 #endif
836 
837   PetscFunctionBegin;
838   rstart = C->rmap->rstart;
839   cstart = C->cmap->rstart;
840   ai     = a->i;
841   aj     = a->j;
842   bi     = b->i;
843   bj     = b->j;
844   garray = c->garray;
845 
846   for (i=0; i<imax; i++) {
847 #if defined(PETSC_USE_CTABLE)
848     /* copy existing entries of table_data_i into tdata[] */
849     table_data_i = table_data[i];
850     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
851     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
852 
853     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
854     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
855     while (tpos) {
856       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
857       tdata[--j] = --row;
858       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
859     }
860 #else
861     data_i  = data[i];
862 #endif
863     table_i = table[i];
864     isz_i   = isz[i];
865     max     = isz[i];
866 
867     for (j=0; j<max; j++) {
868 #if defined(PETSC_USE_CTABLE)
869       row   = tdata[j] - rstart;
870 #else
871       row   = data_i[j] - rstart;
872 #endif
873       start = ai[row];
874       end   = ai[row+1];
875       for (k=start; k<end; k++) { /* Amat */
876         val = aj[k] + cstart;
877         if (!PetscBTLookupSet(table_i,val)) {
878 #if defined(PETSC_USE_CTABLE)
879           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
880 #else
881           data_i[isz_i] = val;
882 #endif
883           isz_i++;
884         }
885       }
886       start = bi[row];
887       end   = bi[row+1];
888       for (k=start; k<end; k++) { /* Bmat */
889         val = garray[bj[k]];
890         if (!PetscBTLookupSet(table_i,val)) {
891 #if defined(PETSC_USE_CTABLE)
892           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
893 #else
894           data_i[isz_i] = val;
895 #endif
896           isz_i++;
897         }
898       }
899     }
900     isz[i] = isz_i;
901 
902 #if defined(PETSC_USE_CTABLE)
903     ierr = PetscFree(tdata);CHKERRQ(ierr);
904 #endif
905   }
906   PetscFunctionReturn(0);
907 }
908 
909 /*
910       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
911          and return the output
912 
913          Input:
914            C    - the matrix
915            nrqr - no of messages being processed.
916            rbuf - an array of pointers to the recieved requests
917 
918          Output:
919            xdata - array of messages to be sent back
920            isz1  - size of each message
921 
922   For better efficiency perhaps we should malloc separately each xdata[i],
923 then if a remalloc is required we need only copy the data for that one row
924 rather then all previous rows as it is now where a single large chunck of
925 memory is used.
926 
927 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;  /* global row count: upper bound for the dedup bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;   /* maps compressed B columns to global columns */

  /* ct = total number of index sets requested; total_sz = total number of requested rows */

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply size from the average nonzeros per local row; the single
     shared buffer xdata[0] is grown on demand below if the estimate is too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: skip the header of message i */
    ct2    =  ct1;          /* write cursor into xdata[i] (header space reserved) */
    ct3   += ct1;           /* total words consumed across all replies so far */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); /* fresh dedup table per IS */
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase xdata[1..i] into the new storage */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each requested row just copied in by one level of overlap */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: local column + cstart = global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* replies are packed back-to-back in the shared buffer */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1044 /* -------------------------------------------------------------------------*/
1045 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1046 /*
1047     Every processor gets the entire matrix
1048 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row lengths = diagonal-block nz + off-diagonal-block nz per row */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into my slice of B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row so global column indices come out in ascending order:
       B-columns below the diagonal block, then A-columns, then B-columns above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: the structure already exists, only values may need refreshing */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* write directly into my slice of B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are merged in the same lower/diagonal/upper order as the indices above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* NULL in the MAT_REUSE_MATRIX path; allocated above otherwise */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1231 
1232 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1233 {
1234   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1235   Mat            submat,A = c->A,B = c->B;
1236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1237   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1238   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1239   const PetscInt *icol,*irow;
1240   PetscInt       nrow,ncol,start;
1241   PetscErrorCode ierr;
1242   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1243   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1244   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1245   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1246   PetscInt       *lens,rmax,ncols,*cols,Crow;
1247 #if defined(PETSC_USE_CTABLE)
1248   PetscTable     cmap,rmap;
1249   PetscInt       *cmap_loc,*rmap_loc;
1250 #else
1251   PetscInt       *cmap,*rmap;
1252 #endif
1253   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1254   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1255   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1256   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1257   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1258   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1259   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1260   MPI_Comm       comm;
1261   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1262   PetscMPIInt    *onodes1,*olengths1,idex,end;
1263   Mat_SubSppt    *smatis1;
1264   PetscBool      isrowsorted,iscolsorted;
1265 
1266   PetscFunctionBegin;
1267   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268 
1269   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1270   size = c->size;
1271   rank = c->rank;
1272 
1273   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1274   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1275   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1276   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1277   if (allcolumns) {
1278     icol = NULL;
1279     ncol = C->cmap->N;
1280   } else {
1281     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1282     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1283   }
1284 
1285   if (scall == MAT_INITIAL_MATRIX) {
1286     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1287 
1288     /* Get some new tags to keep the communication clean */
1289     tag1 = ((PetscObject)C)->tag;
1290     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1291     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1292 
1293     /* evaluate communication - mesg to who, length of mesg, and buffer space
1294      required. Based on this, buffers are allocated, and data copied into them */
1295     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1296     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1297 
1298     /* w1[proc] = num of rows owned by proc -- to be requested */
1299     proc = 0;
1300     nrqs = 0; /* num of outgoing messages */
1301     for (j=0; j<nrow; j++) {
1302       row  = irow[j];
1303       if (!isrowsorted) proc = 0;
1304       while (row >= C->rmap->range[proc+1]) proc++;
1305       w1[proc]++;
1306       row2proc[j] = proc; /* map row index to proc */
1307 
1308       if (proc != rank && !w2[proc]) {
1309         w2[proc] = 1; nrqs++;
1310       }
1311     }
1312     w1[rank] = 0;  /* rows owned by self will not be requested */
1313 
1314     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1315     for (proc=0,j=0; proc<size; proc++) {
1316       if (w1[proc]) { pa[j++] = proc;}
1317     }
1318 
1319     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1320     msz = 0;              /* total mesg length (for all procs) */
1321     for (i=0; i<nrqs; i++) {
1322       proc      = pa[i];
1323       w1[proc] += 3;
1324       msz      += w1[proc];
1325     }
1326     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1327 
1328     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1329     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1330     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1331 
1332     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1333        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1334     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1335 
1336     /* Now post the Irecvs corresponding to these messages */
1337     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1338 
1339     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1340     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1341 
1342     /* Allocate Memory for outgoing messages */
1343     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1344     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1345     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1346 
1347     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1348     iptr = tmp;
1349     for (i=0; i<nrqs; i++) {
1350       proc        = pa[i];
1351       sbuf1[proc] = iptr;
1352       iptr       += w1[proc];
1353     }
1354 
1355     /* Form the outgoing messages */
1356     /* Initialize the header space */
1357     for (i=0; i<nrqs; i++) {
1358       proc      = pa[i];
1359       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1360       ptr[proc] = sbuf1[proc] + 3;
1361     }
1362 
1363     /* Parse the isrow and copy data into outbuf */
1364     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1365     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1366       proc = row2proc[j];
1367       if (proc != rank) { /* copy to the outgoing buf*/
1368         *ptr[proc] = irow[j];
1369         ctr[proc]++; ptr[proc]++;
1370       }
1371     }
1372 
1373     /* Update the headers for the current IS */
1374     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1375       if ((ctr_j = ctr[j])) {
1376         sbuf1_j        = sbuf1[j];
1377         k              = ++sbuf1_j[0];
1378         sbuf1_j[2*k]   = ctr_j;
1379         sbuf1_j[2*k-1] = 0;
1380       }
1381     }
1382 
1383     /* Now post the sends */
1384     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1385     for (i=0; i<nrqs; ++i) {
1386       proc = pa[i];
1387       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1388     }
1389 
1390     /* Post Receives to capture the buffer size */
1391     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1392     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1393 
1394     rbuf2[0] = tmp + msz;
1395     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1396 
1397     for (i=0; i<nrqs; ++i) {
1398       proc = pa[i];
1399       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1400     }
1401 
1402     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1403 
1404     /* Send to other procs the buf size they should allocate */
1405     /* Receive messages*/
1406     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1407     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1408 
1409     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1410     for (i=0; i<nrqr; ++i) {
1411       req_size[i] = 0;
1412       rbuf1_i        = rbuf1[i];
1413       start          = 2*rbuf1_i[0] + 1;
1414       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1415       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1416       sbuf2_i        = sbuf2[i];
1417       for (j=start; j<end; j++) {
1418         k            = rbuf1_i[j] - rstart;
1419         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1420         sbuf2_i[j]   = ncols;
1421         req_size[i] += ncols;
1422       }
1423       req_source1[i] = r_status1[i].MPI_SOURCE;
1424 
1425       /* form the header */
1426       sbuf2_i[0] = req_size[i];
1427       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1428 
1429       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1430     }
1431 
1432     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1433     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1434 
1435     /* rbuf2 is received, Post recv column indices a->j */
1436     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1437 
1438     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1439     for (i=0; i<nrqs; ++i) {
1440       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1441       req_source2[i] = r_status2[i].MPI_SOURCE;
1442       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1443     }
1444 
1445     /* Wait on sends1 and sends2 */
1446     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1447     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1448     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1449     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1450 
1451     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1452     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1453 
1454     /* Now allocate sending buffers for a->j, and send them off */
1455     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1456     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1457     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1458     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1459 
1460     for (i=0; i<nrqr; i++) { /* for each requested message */
1461       rbuf1_i   = rbuf1[i];
1462       sbuf_aj_i = sbuf_aj[i];
1463       ct1       = 2*rbuf1_i[0] + 1;
1464       ct2       = 0;
1465       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1466 
1467       kmax = rbuf1[i][2];
1468       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1469         row    = rbuf1_i[ct1] - rstart;
1470         nzA    = ai[row+1] - ai[row];
1471         nzB    = bi[row+1] - bi[row];
1472         ncols  = nzA + nzB;
1473         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1474 
1475         /* load the column indices for this row into cols*/
1476         cols = sbuf_aj_i + ct2;
1477 
1478         lwrite = 0;
1479         for (l=0; l<nzB; l++) {
1480           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1481         }
1482         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1483         for (l=0; l<nzB; l++) {
1484           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1485         }
1486 
1487         ct2 += ncols;
1488       }
1489       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1490     }
1491 
1492     /* create column map (cmap): global col of C -> local col of submat */
1493 #if defined(PETSC_USE_CTABLE)
1494     if (!allcolumns) {
1495       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1496       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1497       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1498         if (icol[j] >= cstart && icol[j] <cend) {
1499           cmap_loc[icol[j] - cstart] = j+1;
1500         } else { /* use PetscTable for non-local col indices */
1501           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1502         }
1503       }
1504     } else {
1505       cmap     = NULL;
1506       cmap_loc = NULL;
1507     }
1508     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1509 #else
1510     if (!allcolumns) {
1511       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1512       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1513     } else {
1514       cmap = NULL;
1515     }
1516 #endif
1517 
1518     /* Create lens for MatSeqAIJSetPreallocation() */
1519     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1520 
1521     /* Compute lens from local part of C */
1522     for (j=0; j<nrow; j++) {
1523       row  = irow[j];
1524       proc = row2proc[j];
1525       if (proc == rank) {
1526         /* diagonal part A = c->A */
1527         ncols = ai[row-rstart+1] - ai[row-rstart];
1528         cols  = aj + ai[row-rstart];
1529         if (!allcolumns) {
1530           for (k=0; k<ncols; k++) {
1531 #if defined(PETSC_USE_CTABLE)
1532             tcol = cmap_loc[cols[k]];
1533 #else
1534             tcol = cmap[cols[k]+cstart];
1535 #endif
1536             if (tcol) lens[j]++;
1537           }
1538         } else { /* allcolumns */
1539           lens[j] = ncols;
1540         }
1541 
1542         /* off-diagonal part B = c->B */
1543         ncols = bi[row-rstart+1] - bi[row-rstart];
1544         cols  = bj + bi[row-rstart];
1545         if (!allcolumns) {
1546           for (k=0; k<ncols; k++) {
1547 #if defined(PETSC_USE_CTABLE)
1548             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1549 #else
1550             tcol = cmap[bmap[cols[k]]];
1551 #endif
1552             if (tcol) lens[j]++;
1553           }
1554         } else { /* allcolumns */
1555           lens[j] += ncols;
1556         }
1557       }
1558     }
1559 
1560     /* Create row map (rmap): global row of C -> local row of submat */
1561 #if defined(PETSC_USE_CTABLE)
1562     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1563     for (j=0; j<nrow; j++) {
1564       row  = irow[j];
1565       proc = row2proc[j];
1566       if (proc == rank) { /* a local row */
1567         rmap_loc[row - rstart] = j;
1568       } else {
1569         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1570       }
1571     }
1572 #else
1573     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1574     for (j=0; j<nrow; j++) {
1575       rmap[irow[j]] = j;
1576     }
1577 #endif
1578 
1579     /* Update lens from offproc data */
1580     /* recv a->j is done */
1581     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1582     for (i=0; i<nrqs; i++) {
1583       proc    = pa[i];
1584       sbuf1_i = sbuf1[proc];
1585       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1586       ct1     = 2 + 1;
1587       ct2     = 0;
1588       rbuf2_i = rbuf2[i]; /* received length of C->j */
1589       rbuf3_i = rbuf3[i]; /* received C->j */
1590 
1591       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1592       max1   = sbuf1_i[2];
1593       for (k=0; k<max1; k++,ct1++) {
1594 #if defined(PETSC_USE_CTABLE)
1595         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1596         row--;
1597         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1598 #else
1599         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1600 #endif
1601         /* Now, store row index of submat in sbuf1_i[ct1] */
1602         sbuf1_i[ct1] = row;
1603 
1604         nnz = rbuf2_i[ct1];
1605         if (!allcolumns) {
1606           for (l=0; l<nnz; l++,ct2++) {
1607 #if defined(PETSC_USE_CTABLE)
1608             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1609               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1610             } else {
1611               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1612             }
1613 #else
1614             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1615 #endif
1616             if (tcol) lens[row]++;
1617           }
1618         } else { /* allcolumns */
1619           lens[row] += nnz;
1620         }
1621       }
1622     }
1623     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1624     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1625 
1626     /* Create the submatrices */
1627     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1628     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1629 
1630     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1631     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1632     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1633     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1634     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1635 
1636     /* create struct Mat_SubSppt and attached it to submat */
1637     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1638     subc = (Mat_SeqAIJ*)submat->data;
1639     subc->submatis1 = smatis1;
1640 
1641     smatis1->id          = 0;
1642     smatis1->nrqs        = nrqs;
1643     smatis1->nrqr        = nrqr;
1644     smatis1->rbuf1       = rbuf1;
1645     smatis1->rbuf2       = rbuf2;
1646     smatis1->rbuf3       = rbuf3;
1647     smatis1->sbuf2       = sbuf2;
1648     smatis1->req_source2 = req_source2;
1649 
1650     smatis1->sbuf1       = sbuf1;
1651     smatis1->ptr         = ptr;
1652     smatis1->tmp         = tmp;
1653     smatis1->ctr         = ctr;
1654 
1655     smatis1->pa           = pa;
1656     smatis1->req_size     = req_size;
1657     smatis1->req_source1  = req_source1;
1658 
1659     smatis1->allcolumns  = allcolumns;
1660     smatis1->singleis    = PETSC_TRUE;
1661     smatis1->row2proc    = row2proc;
1662     smatis1->rmap        = rmap;
1663     smatis1->cmap        = cmap;
1664 #if defined(PETSC_USE_CTABLE)
1665     smatis1->rmap_loc    = rmap_loc;
1666     smatis1->cmap_loc    = cmap_loc;
1667 #endif
1668 
1669     smatis1->destroy     = submat->ops->destroy;
1670     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1671     submat->factortype   = C->factortype;
1672 
1673     /* compute rmax */
1674     rmax = 0;
1675     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1676 
1677   } else { /* scall == MAT_REUSE_MATRIX */
1678     submat = submats[0];
1679     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1680 
1681     subc    = (Mat_SeqAIJ*)submat->data;
1682     rmax    = subc->rmax;
1683     smatis1 = subc->submatis1;
1684     nrqs        = smatis1->nrqs;
1685     nrqr        = smatis1->nrqr;
1686     rbuf1       = smatis1->rbuf1;
1687     rbuf2       = smatis1->rbuf2;
1688     rbuf3       = smatis1->rbuf3;
1689     req_source2 = smatis1->req_source2;
1690 
1691     sbuf1     = smatis1->sbuf1;
1692     sbuf2     = smatis1->sbuf2;
1693     ptr       = smatis1->ptr;
1694     tmp       = smatis1->tmp;
1695     ctr       = smatis1->ctr;
1696 
1697     pa         = smatis1->pa;
1698     req_size   = smatis1->req_size;
1699     req_source1 = smatis1->req_source1;
1700 
1701     allcolumns = smatis1->allcolumns;
1702     row2proc   = smatis1->row2proc;
1703     rmap       = smatis1->rmap;
1704     cmap       = smatis1->cmap;
1705 #if defined(PETSC_USE_CTABLE)
1706     rmap_loc   = smatis1->rmap_loc;
1707     cmap_loc   = smatis1->cmap_loc;
1708 #endif
1709   }
1710 
1711   /* Post recv matrix values */
1712   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1713   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1714   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1715   for (i=0; i<nrqs; ++i) {
1716     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1717     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1718   }
1719 
1720   /* Allocate sending buffers for a->a, and send them off */
1721   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1722   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1723   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1724   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1725 
1726   for (i=0; i<nrqr; i++) {
1727     rbuf1_i   = rbuf1[i];
1728     sbuf_aa_i = sbuf_aa[i];
1729     ct1       = 2*rbuf1_i[0]+1;
1730     ct2       = 0;
1731     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1732 
1733     kmax = rbuf1_i[2];
1734     for (k=0; k<kmax; k++,ct1++) {
1735       row = rbuf1_i[ct1] - rstart;
1736       nzA = ai[row+1] - ai[row];
1737       nzB = bi[row+1] - bi[row];
1738       ncols  = nzA + nzB;
1739       cworkB = bj + bi[row];
1740       vworkA = a_a + ai[row];
1741       vworkB = b_a + bi[row];
1742 
1743       /* load the column values for this row into vals*/
1744       vals = sbuf_aa_i + ct2;
1745 
1746       lwrite = 0;
1747       for (l=0; l<nzB; l++) {
1748         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1749       }
1750       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1751       for (l=0; l<nzB; l++) {
1752         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1753       }
1754 
1755       ct2 += ncols;
1756     }
1757     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1758   }
1759 
1760   /* Assemble submat */
1761   /* First assemble the local rows */
1762   for (j=0; j<nrow; j++) {
1763     row  = irow[j];
1764     proc = row2proc[j];
1765     if (proc == rank) {
1766       Crow = row - rstart;  /* local row index of C */
1767 #if defined(PETSC_USE_CTABLE)
1768       row = rmap_loc[Crow]; /* row index of submat */
1769 #else
1770       row = rmap[row];
1771 #endif
1772 
1773       if (allcolumns) {
1774         /* diagonal part A = c->A */
1775         ncols = ai[Crow+1] - ai[Crow];
1776         cols  = aj + ai[Crow];
1777         vals  = a->a + ai[Crow];
1778         i     = 0;
1779         for (k=0; k<ncols; k++) {
1780           subcols[i]   = cols[k] + cstart;
1781           subvals[i++] = vals[k];
1782         }
1783 
1784         /* off-diagonal part B = c->B */
1785         ncols = bi[Crow+1] - bi[Crow];
1786         cols  = bj + bi[Crow];
1787         vals  = b->a + bi[Crow];
1788         for (k=0; k<ncols; k++) {
1789           subcols[i]   = bmap[cols[k]];
1790           subvals[i++] = vals[k];
1791         }
1792 
1793         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1794 
1795       } else { /* !allcolumns */
1796 #if defined(PETSC_USE_CTABLE)
1797         /* diagonal part A = c->A */
1798         ncols = ai[Crow+1] - ai[Crow];
1799         cols  = aj + ai[Crow];
1800         vals  = a->a + ai[Crow];
1801         i     = 0;
1802         for (k=0; k<ncols; k++) {
1803           tcol = cmap_loc[cols[k]];
1804           if (tcol) {
1805             subcols[i]   = --tcol;
1806             subvals[i++] = vals[k];
1807           }
1808         }
1809 
1810         /* off-diagonal part B = c->B */
1811         ncols = bi[Crow+1] - bi[Crow];
1812         cols  = bj + bi[Crow];
1813         vals  = b->a + bi[Crow];
1814         for (k=0; k<ncols; k++) {
1815           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1816           if (tcol) {
1817             subcols[i]   = --tcol;
1818             subvals[i++] = vals[k];
1819           }
1820         }
1821 #else
1822         /* diagonal part A = c->A */
1823         ncols = ai[Crow+1] - ai[Crow];
1824         cols  = aj + ai[Crow];
1825         vals  = a->a + ai[Crow];
1826         i     = 0;
1827         for (k=0; k<ncols; k++) {
1828           tcol = cmap[cols[k]+cstart];
1829           if (tcol) {
1830             subcols[i]   = --tcol;
1831             subvals[i++] = vals[k];
1832           }
1833         }
1834 
1835         /* off-diagonal part B = c->B */
1836         ncols = bi[Crow+1] - bi[Crow];
1837         cols  = bj + bi[Crow];
1838         vals  = b->a + bi[Crow];
1839         for (k=0; k<ncols; k++) {
1840           tcol = cmap[bmap[cols[k]]];
1841           if (tcol) {
1842             subcols[i]   = --tcol;
1843             subvals[i++] = vals[k];
1844           }
1845         }
1846 #endif
1847         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1848       }
1849     }
1850   }
1851 
1852   /* Now assemble the off-proc rows */
1853   for (i=0; i<nrqs; i++) { /* for each requested message */
1854     /* recv values from other processes */
1855     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1856     proc    = pa[idex];
1857     sbuf1_i = sbuf1[proc];
1858     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1859     ct1     = 2 + 1;
1860     ct2     = 0; /* count of received C->j */
1861     ct3     = 0; /* count of received C->j that will be inserted into submat */
1862     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1863     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1864     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1865 
1866     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1867     max1 = sbuf1_i[2];             /* num of rows */
1868     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1869       row = sbuf1_i[ct1]; /* row index of submat */
1870       if (!allcolumns) {
1871         idex = 0;
1872         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1873           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1874           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1875 #if defined(PETSC_USE_CTABLE)
1876             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1877               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1878             } else {
1879               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1880             }
1881 #else
1882             tcol = cmap[rbuf3_i[ct2]];
1883 #endif
1884             if (tcol) {
1885               subcols[idex]   = --tcol; /* may not be sorted */
1886               subvals[idex++] = rbuf4_i[ct2];
1887 
1888               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1889                For reuse, we replace received C->j with index that should be inserted to submat */
1890               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1891             }
1892           }
1893           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1894         } else { /* scall == MAT_REUSE_MATRIX */
1895           submat = submats[0];
1896           subc   = (Mat_SeqAIJ*)submat->data;
1897 
1898           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1899           for (l=0; l<nnz; l++) {
1900             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1901             subvals[idex++] = rbuf4_i[ct2];
1902           }
1903 
1904           bj = subc->j + subc->i[row]; /* sorted column indices */
1905           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1906         }
1907       } else { /* allcolumns */
1908         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1909         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1910         ct2 += nnz;
1911       }
1912     }
1913   }
1914 
1915   /* sending a->a are done */
1916   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1917   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1918 
1919   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1920   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921   submats[0] = submat;
1922 
1923   /* Restore the indices */
1924   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1925   if (!allcolumns) {
1926     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1927   }
1928 
1929   /* Destroy allocated memory */
1930   for (i=0; i<nrqs; ++i) {
1931     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1932   }
1933   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1934   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1935   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1936 
1937   if (scall == MAT_INITIAL_MATRIX) {
1938     ierr = PetscFree(lens);CHKERRQ(ierr);
1939     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1940     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1941   }
1942   PetscFunctionReturn(0);
1943 }
1944 
1945 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1946 {
1947   PetscErrorCode ierr;
1948   PetscInt       ncol;
1949   PetscBool      colflag,allcolumns=PETSC_FALSE;
1950 
1951   PetscFunctionBegin;
1952   /* Allocate memory to hold all the submatrices */
1953   if (scall == MAT_INITIAL_MATRIX) {
1954     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1955   }
1956 
1957   /* Check for special case: each processor gets entire matrix columns */
1958   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1959   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1960   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1961 
1962   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1963   PetscFunctionReturn(0);
1964 }
1965 
/*
   MatCreateSubMatrices_MPIAIJ - Extracts ismax sequential submatrices of a parallel
   AIJ matrix C, one per (isrow[i], iscol[i]) pair. Work is split into stages so the
   per-stage workspace stays bounded; special fast paths exist for the single-IS case
   and for the "want the entire matrix" case.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extracted per stage; sized so each stage uses
     roughly 20M ints of workspace (shrinks as the global column count grows) */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* wantallmatrix: a single identity row IS and identity column IS spanning the
       whole (square) matrix means the caller wants all of C */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One allreduce handles both values: MPI_MAX over the negated flag implements a
       global AND (wantallmatrix stays TRUE only if every rank set it), while MPI_MAX
       over nstages forces every rank to loop the same global number of stages. */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the Mat_SubSppt bookkeeping saved by the initial call; ranks that
       requested no submatrices stored it in a dummy matrix instead. */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  /* Staged extraction: each stage handles up to nmax index sets; ranks that have
     exhausted their own sets still participate (max_no == 0) so collective
     communication inside the _Local routine stays matched. */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
      smat->nstages = nstages; /* saved so MAT_REUSE_MATRIX repeats the same stage count */
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2063 
2064 /* -------------------------------------------------------------------------*/
2065 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2066 {
2067   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2068   Mat            A  = c->A;
2069   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2070   const PetscInt **icol,**irow;
2071   PetscInt       *nrow,*ncol,start;
2072   PetscErrorCode ierr;
2073   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2074   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2075   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2076   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2077   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2078 #if defined(PETSC_USE_CTABLE)
2079   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2080 #else
2081   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2082 #endif
2083   const PetscInt *irow_i;
2084   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2085   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2086   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2087   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2088   MPI_Status     *r_status3,*r_status4,*s_status4;
2089   MPI_Comm       comm;
2090   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2091   PetscMPIInt    *onodes1,*olengths1,end;
2092   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2093   Mat_SubSppt    *smat_i;
2094   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2095   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2096 
2097   PetscFunctionBegin;
2098   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2099   size = c->size;
2100   rank = c->rank;
2101 
2102   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2103   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2104 
2105   for (i=0; i<ismax; i++) {
2106     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2107     if (!issorted[i]) iscsorted = issorted[i];
2108 
2109     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2110 
2111     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2112     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2113 
2114     /* Check for special case: allcolumn */
2115     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2116     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2117     if (colflag && ncol[i] == C->cmap->N) {
2118       allcolumns[i] = PETSC_TRUE;
2119       icol[i] = NULL;
2120     } else {
2121       allcolumns[i] = PETSC_FALSE;
2122       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2123     }
2124   }
2125 
2126   if (scall == MAT_REUSE_MATRIX) {
2127     /* Assumes new rows are same length as the old rows */
2128     for (i=0; i<ismax; i++) {
2129       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2130       subc = (Mat_SeqAIJ*)submats[i]->data;
2131       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2132 
2133       /* Initial matrix as if empty */
2134       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2135 
2136       smat_i   = subc->submatis1;
2137 
2138       nrqs        = smat_i->nrqs;
2139       nrqr        = smat_i->nrqr;
2140       rbuf1       = smat_i->rbuf1;
2141       rbuf2       = smat_i->rbuf2;
2142       rbuf3       = smat_i->rbuf3;
2143       req_source2 = smat_i->req_source2;
2144 
2145       sbuf1     = smat_i->sbuf1;
2146       sbuf2     = smat_i->sbuf2;
2147       ptr       = smat_i->ptr;
2148       tmp       = smat_i->tmp;
2149       ctr       = smat_i->ctr;
2150 
2151       pa          = smat_i->pa;
2152       req_size    = smat_i->req_size;
2153       req_source1 = smat_i->req_source1;
2154 
2155       allcolumns[i] = smat_i->allcolumns;
2156       row2proc[i]   = smat_i->row2proc;
2157       rmap[i]       = smat_i->rmap;
2158       cmap[i]       = smat_i->cmap;
2159     }
2160 
2161     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2162       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2163       smat_i = (Mat_SubSppt*)submats[0]->data;
2164 
2165       nrqs        = smat_i->nrqs;
2166       nrqr        = smat_i->nrqr;
2167       rbuf1       = smat_i->rbuf1;
2168       rbuf2       = smat_i->rbuf2;
2169       rbuf3       = smat_i->rbuf3;
2170       req_source2 = smat_i->req_source2;
2171 
2172       sbuf1       = smat_i->sbuf1;
2173       sbuf2       = smat_i->sbuf2;
2174       ptr         = smat_i->ptr;
2175       tmp         = smat_i->tmp;
2176       ctr         = smat_i->ctr;
2177 
2178       pa          = smat_i->pa;
2179       req_size    = smat_i->req_size;
2180       req_source1 = smat_i->req_source1;
2181 
2182       allcolumns[0] = PETSC_FALSE;
2183     }
2184   } else { /* scall == MAT_INITIAL_MATRIX */
2185     /* Get some new tags to keep the communication clean */
2186     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2187     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2188 
2189     /* evaluate communication - mesg to who, length of mesg, and buffer space
2190      required. Based on this, buffers are allocated, and data copied into them*/
2191     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2192 
2193     for (i=0; i<ismax; i++) {
2194       jmax   = nrow[i];
2195       irow_i = irow[i];
2196 
2197       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2198       row2proc[i] = row2proc_i;
2199 
2200       if (issorted[i]) proc = 0;
2201       for (j=0; j<jmax; j++) {
2202         if (!issorted[i]) proc = 0;
2203         row = irow_i[j];
2204         while (row >= C->rmap->range[proc+1]) proc++;
2205         w4[proc]++;
2206         row2proc_i[j] = proc; /* map row index to proc */
2207       }
2208       for (j=0; j<size; j++) {
2209         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2210       }
2211     }
2212 
2213     nrqs     = 0;              /* no of outgoing messages */
2214     msz      = 0;              /* total mesg length (for all procs) */
2215     w1[rank] = 0;              /* no mesg sent to self */
2216     w3[rank] = 0;
2217     for (i=0; i<size; i++) {
2218       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2219     }
2220     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2221     for (i=0,j=0; i<size; i++) {
2222       if (w1[i]) { pa[j] = i; j++; }
2223     }
2224 
2225     /* Each message would have a header = 1 + 2*(no of IS) + data */
2226     for (i=0; i<nrqs; i++) {
2227       j      = pa[i];
2228       w1[j] += w2[j] + 2* w3[j];
2229       msz   += w1[j];
2230     }
2231     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2232 
2233     /* Determine the number of messages to expect, their lengths, from from-ids */
2234     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2235     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2236 
2237     /* Now post the Irecvs corresponding to these messages */
2238     tag0 = ((PetscObject)C)->tag;
2239     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2240 
2241     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2242     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2243 
2244     /* Allocate Memory for outgoing messages */
2245     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2246     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2247     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2248 
2249     {
2250       PetscInt *iptr = tmp;
2251       k    = 0;
2252       for (i=0; i<nrqs; i++) {
2253         j        = pa[i];
2254         iptr    += k;
2255         sbuf1[j] = iptr;
2256         k        = w1[j];
2257       }
2258     }
2259 
2260     /* Form the outgoing messages. Initialize the header space */
2261     for (i=0; i<nrqs; i++) {
2262       j           = pa[i];
2263       sbuf1[j][0] = 0;
2264       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2265       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2266     }
2267 
2268     /* Parse the isrow and copy data into outbuf */
2269     for (i=0; i<ismax; i++) {
2270       row2proc_i = row2proc[i];
2271       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2272       irow_i = irow[i];
2273       jmax   = nrow[i];
2274       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2275         proc = row2proc_i[j];
2276         if (proc != rank) { /* copy to the outgoing buf*/
2277           ctr[proc]++;
2278           *ptr[proc] = irow_i[j];
2279           ptr[proc]++;
2280         }
2281       }
2282       /* Update the headers for the current IS */
2283       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2284         if ((ctr_j = ctr[j])) {
2285           sbuf1_j        = sbuf1[j];
2286           k              = ++sbuf1_j[0];
2287           sbuf1_j[2*k]   = ctr_j;
2288           sbuf1_j[2*k-1] = i;
2289         }
2290       }
2291     }
2292 
2293     /*  Now  post the sends */
2294     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2295     for (i=0; i<nrqs; ++i) {
2296       j    = pa[i];
2297       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2298     }
2299 
2300     /* Post Receives to capture the buffer size */
2301     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2302     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2303     rbuf2[0] = tmp + msz;
2304     for (i=1; i<nrqs; ++i) {
2305       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2306     }
2307     for (i=0; i<nrqs; ++i) {
2308       j    = pa[i];
2309       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2310     }
2311 
2312     /* Send to other procs the buf size they should allocate */
2313     /* Receive messages*/
2314     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2315     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2316     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2317     {
2318       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2319       PetscInt   *sbuf2_i;
2320 
2321       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2322       for (i=0; i<nrqr; ++i) {
2323         req_size[i] = 0;
2324         rbuf1_i        = rbuf1[i];
2325         start          = 2*rbuf1_i[0] + 1;
2326         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2327         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2328         sbuf2_i        = sbuf2[i];
2329         for (j=start; j<end; j++) {
2330           id              = rbuf1_i[j] - rstart;
2331           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2332           sbuf2_i[j]      = ncols;
2333           req_size[i] += ncols;
2334         }
2335         req_source1[i] = r_status1[i].MPI_SOURCE;
2336         /* form the header */
2337         sbuf2_i[0] = req_size[i];
2338         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2339 
2340         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2341       }
2342     }
2343     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2344     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2345     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2346 
2347     /* Receive messages*/
2348     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2349     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2350 
2351     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2352     for (i=0; i<nrqs; ++i) {
2353       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2354       req_source2[i] = r_status2[i].MPI_SOURCE;
2355       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2356     }
2357     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2358     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2359 
2360     /* Wait on sends1 and sends2 */
2361     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2362     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2363 
2364     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2365     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2366     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2367     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2368     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2369     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2370 
2371     /* Now allocate sending buffers for a->j, and send them off */
2372     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2373     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2374     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2375     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2376 
2377     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2378     {
2379       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2380       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2381       PetscInt cend = C->cmap->rend;
2382       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2383 
2384       for (i=0; i<nrqr; i++) {
2385         rbuf1_i   = rbuf1[i];
2386         sbuf_aj_i = sbuf_aj[i];
2387         ct1       = 2*rbuf1_i[0] + 1;
2388         ct2       = 0;
2389         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2390           kmax = rbuf1[i][2*j];
2391           for (k=0; k<kmax; k++,ct1++) {
2392             row    = rbuf1_i[ct1] - rstart;
2393             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2394             ncols  = nzA + nzB;
2395             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2396 
2397             /* load the column indices for this row into cols */
2398             cols = sbuf_aj_i + ct2;
2399 
2400             lwrite = 0;
2401             for (l=0; l<nzB; l++) {
2402               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2403             }
2404             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2405             for (l=0; l<nzB; l++) {
2406               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2407             }
2408 
2409             ct2 += ncols;
2410           }
2411         }
2412         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2413       }
2414     }
2415     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2416 
2417     /* create col map: global col of C -> local col of submatrices */
2418     {
2419       const PetscInt *icol_i;
2420 #if defined(PETSC_USE_CTABLE)
2421       for (i=0; i<ismax; i++) {
2422         if (!allcolumns[i]) {
2423           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2424 
2425           jmax   = ncol[i];
2426           icol_i = icol[i];
2427           cmap_i = cmap[i];
2428           for (j=0; j<jmax; j++) {
2429             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2430           }
2431         } else cmap[i] = NULL;
2432       }
2433 #else
2434       for (i=0; i<ismax; i++) {
2435         if (!allcolumns[i]) {
2436           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2437           jmax   = ncol[i];
2438           icol_i = icol[i];
2439           cmap_i = cmap[i];
2440           for (j=0; j<jmax; j++) {
2441             cmap_i[icol_i[j]] = j+1;
2442           }
2443         } else cmap[i] = NULL;
2444       }
2445 #endif
2446     }
2447 
2448     /* Create lens which is required for MatCreate... */
2449     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2450     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2451 
2452     if (ismax) {
2453       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2454     }
2455     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2456 
2457     /* Update lens from local data */
2458     for (i=0; i<ismax; i++) {
2459       row2proc_i = row2proc[i];
2460       jmax = nrow[i];
2461       if (!allcolumns[i]) cmap_i = cmap[i];
2462       irow_i = irow[i];
2463       lens_i = lens[i];
2464       for (j=0; j<jmax; j++) {
2465         row = irow_i[j];
2466         proc = row2proc_i[j];
2467         if (proc == rank) {
2468           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2469           if (!allcolumns[i]) {
2470             for (k=0; k<ncols; k++) {
2471 #if defined(PETSC_USE_CTABLE)
2472               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2473 #else
2474               tcol = cmap_i[cols[k]];
2475 #endif
2476               if (tcol) lens_i[j]++;
2477             }
2478           } else { /* allcolumns */
2479             lens_i[j] = ncols;
2480           }
2481           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2482         }
2483       }
2484     }
2485 
2486     /* Create row map: global row of C -> local row of submatrices */
2487 #if defined(PETSC_USE_CTABLE)
2488     for (i=0; i<ismax; i++) {
2489       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2490       irow_i = irow[i];
2491       jmax   = nrow[i];
2492       for (j=0; j<jmax; j++) {
2493       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2494       }
2495     }
2496 #else
2497     for (i=0; i<ismax; i++) {
2498       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2499       rmap_i = rmap[i];
2500       irow_i = irow[i];
2501       jmax   = nrow[i];
2502       for (j=0; j<jmax; j++) {
2503         rmap_i[irow_i[j]] = j;
2504       }
2505     }
2506 #endif
2507 
2508     /* Update lens from offproc data */
2509     {
2510       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2511 
2512       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2513       for (tmp2=0; tmp2<nrqs; tmp2++) {
2514         sbuf1_i = sbuf1[pa[tmp2]];
2515         jmax    = sbuf1_i[0];
2516         ct1     = 2*jmax+1;
2517         ct2     = 0;
2518         rbuf2_i = rbuf2[tmp2];
2519         rbuf3_i = rbuf3[tmp2];
2520         for (j=1; j<=jmax; j++) {
2521           is_no  = sbuf1_i[2*j-1];
2522           max1   = sbuf1_i[2*j];
2523           lens_i = lens[is_no];
2524           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2525           rmap_i = rmap[is_no];
2526           for (k=0; k<max1; k++,ct1++) {
2527 #if defined(PETSC_USE_CTABLE)
2528             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2529             row--;
2530             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2531 #else
2532             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2533 #endif
2534             max2 = rbuf2_i[ct1];
2535             for (l=0; l<max2; l++,ct2++) {
2536               if (!allcolumns[is_no]) {
2537 #if defined(PETSC_USE_CTABLE)
2538                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2539 #else
2540                 tcol = cmap_i[rbuf3_i[ct2]];
2541 #endif
2542                 if (tcol) lens_i[row]++;
2543               } else { /* allcolumns */
2544                 lens_i[row]++; /* lens_i[row] += max2 ? */
2545               }
2546             }
2547           }
2548         }
2549       }
2550     }
2551     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2552     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2553     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2554     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2555 
2556     /* Create the submatrices */
2557     for (i=0; i<ismax; i++) {
2558       PetscInt    rbs,cbs;
2559 
2560       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2561       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2562 
2563       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2564       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2565 
2566       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2567       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2568       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2569 
2570       /* create struct Mat_SubSppt and attached it to submat */
2571       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2572       subc = (Mat_SeqAIJ*)submats[i]->data;
2573       subc->submatis1 = smat_i;
2574 
2575       smat_i->destroy          = submats[i]->ops->destroy;
2576       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2577       submats[i]->factortype   = C->factortype;
2578 
2579       smat_i->id          = i;
2580       smat_i->nrqs        = nrqs;
2581       smat_i->nrqr        = nrqr;
2582       smat_i->rbuf1       = rbuf1;
2583       smat_i->rbuf2       = rbuf2;
2584       smat_i->rbuf3       = rbuf3;
2585       smat_i->sbuf2       = sbuf2;
2586       smat_i->req_source2 = req_source2;
2587 
2588       smat_i->sbuf1       = sbuf1;
2589       smat_i->ptr         = ptr;
2590       smat_i->tmp         = tmp;
2591       smat_i->ctr         = ctr;
2592 
2593       smat_i->pa           = pa;
2594       smat_i->req_size     = req_size;
2595       smat_i->req_source1  = req_source1;
2596 
2597       smat_i->allcolumns  = allcolumns[i];
2598       smat_i->singleis    = PETSC_FALSE;
2599       smat_i->row2proc    = row2proc[i];
2600       smat_i->rmap        = rmap[i];
2601       smat_i->cmap        = cmap[i];
2602     }
2603 
2604     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2605       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2606       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2607       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2608 
2609       /* create struct Mat_SubSppt and attached it to submat */
2610       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2611       submats[0]->data = (void*)smat_i;
2612 
2613       smat_i->destroy          = submats[0]->ops->destroy;
2614       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2615       submats[0]->factortype   = C->factortype;
2616 
2617       smat_i->id          = 0;
2618       smat_i->nrqs        = nrqs;
2619       smat_i->nrqr        = nrqr;
2620       smat_i->rbuf1       = rbuf1;
2621       smat_i->rbuf2       = rbuf2;
2622       smat_i->rbuf3       = rbuf3;
2623       smat_i->sbuf2       = sbuf2;
2624       smat_i->req_source2 = req_source2;
2625 
2626       smat_i->sbuf1       = sbuf1;
2627       smat_i->ptr         = ptr;
2628       smat_i->tmp         = tmp;
2629       smat_i->ctr         = ctr;
2630 
2631       smat_i->pa           = pa;
2632       smat_i->req_size     = req_size;
2633       smat_i->req_source1  = req_source1;
2634 
2635       smat_i->allcolumns  = PETSC_FALSE;
2636       smat_i->singleis    = PETSC_FALSE;
2637       smat_i->row2proc    = NULL;
2638       smat_i->rmap        = NULL;
2639       smat_i->cmap        = NULL;
2640     }
2641 
2642     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2643     ierr = PetscFree(lens);CHKERRQ(ierr);
2644     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2645     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2646 
2647   } /* endof scall == MAT_INITIAL_MATRIX */
2648 
2649   /* Post recv matrix values */
2650   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2651   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2652   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2653   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2654   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2655   for (i=0; i<nrqs; ++i) {
2656     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2657     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2658   }
2659 
2660   /* Allocate sending buffers for a->a, and send them off */
2661   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2662   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2663   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2664   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2665 
2666   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2667   {
2668     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2669     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2670     PetscInt    cend   = C->cmap->rend;
2671     PetscInt    *b_j   = b->j;
2672     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2673 
2674     for (i=0; i<nrqr; i++) {
2675       rbuf1_i   = rbuf1[i];
2676       sbuf_aa_i = sbuf_aa[i];
2677       ct1       = 2*rbuf1_i[0]+1;
2678       ct2       = 0;
2679       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2680         kmax = rbuf1_i[2*j];
2681         for (k=0; k<kmax; k++,ct1++) {
2682           row    = rbuf1_i[ct1] - rstart;
2683           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2684           ncols  = nzA + nzB;
2685           cworkB = b_j + b_i[row];
2686           vworkA = a_a + a_i[row];
2687           vworkB = b_a + b_i[row];
2688 
2689           /* load the column values for this row into vals*/
2690           vals = sbuf_aa_i+ct2;
2691 
2692           lwrite = 0;
2693           for (l=0; l<nzB; l++) {
2694             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2695           }
2696           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2697           for (l=0; l<nzB; l++) {
2698             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2699           }
2700 
2701           ct2 += ncols;
2702         }
2703       }
2704       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2705     }
2706   }
2707 
2708   /* Assemble the matrices */
2709   /* First assemble the local rows */
2710   for (i=0; i<ismax; i++) {
2711     row2proc_i = row2proc[i];
2712     subc      = (Mat_SeqAIJ*)submats[i]->data;
2713     imat_ilen = subc->ilen;
2714     imat_j    = subc->j;
2715     imat_i    = subc->i;
2716     imat_a    = subc->a;
2717 
2718     if (!allcolumns[i]) cmap_i = cmap[i];
2719     rmap_i = rmap[i];
2720     irow_i = irow[i];
2721     jmax   = nrow[i];
2722     for (j=0; j<jmax; j++) {
2723       row  = irow_i[j];
2724       proc = row2proc_i[j];
2725       if (proc == rank) {
2726         old_row = row;
2727 #if defined(PETSC_USE_CTABLE)
2728         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2729         row--;
2730 #else
2731         row = rmap_i[row];
2732 #endif
2733         ilen_row = imat_ilen[row];
2734         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2735         mat_i    = imat_i[row];
2736         mat_a    = imat_a + mat_i;
2737         mat_j    = imat_j + mat_i;
2738         if (!allcolumns[i]) {
2739           for (k=0; k<ncols; k++) {
2740 #if defined(PETSC_USE_CTABLE)
2741             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2742 #else
2743             tcol = cmap_i[cols[k]];
2744 #endif
2745             if (tcol) {
2746               *mat_j++ = tcol - 1;
2747               *mat_a++ = vals[k];
2748               ilen_row++;
2749             }
2750           }
2751         } else { /* allcolumns */
2752           for (k=0; k<ncols; k++) {
2753             *mat_j++ = cols[k];  /* global col index! */
2754             *mat_a++ = vals[k];
2755             ilen_row++;
2756           }
2757         }
2758         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2759 
2760         imat_ilen[row] = ilen_row;
2761       }
2762     }
2763   }
2764 
2765   /* Now assemble the off proc rows */
2766   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2767   for (tmp2=0; tmp2<nrqs; tmp2++) {
2768     sbuf1_i = sbuf1[pa[tmp2]];
2769     jmax    = sbuf1_i[0];
2770     ct1     = 2*jmax + 1;
2771     ct2     = 0;
2772     rbuf2_i = rbuf2[tmp2];
2773     rbuf3_i = rbuf3[tmp2];
2774     rbuf4_i = rbuf4[tmp2];
2775     for (j=1; j<=jmax; j++) {
2776       is_no     = sbuf1_i[2*j-1];
2777       rmap_i    = rmap[is_no];
2778       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2779       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2780       imat_ilen = subc->ilen;
2781       imat_j    = subc->j;
2782       imat_i    = subc->i;
2783       imat_a    = subc->a;
2784       max1      = sbuf1_i[2*j];
2785       for (k=0; k<max1; k++,ct1++) {
2786         row = sbuf1_i[ct1];
2787 #if defined(PETSC_USE_CTABLE)
2788         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2789         row--;
2790 #else
2791         row = rmap_i[row];
2792 #endif
2793         ilen  = imat_ilen[row];
2794         mat_i = imat_i[row];
2795         mat_a = imat_a + mat_i;
2796         mat_j = imat_j + mat_i;
2797         max2  = rbuf2_i[ct1];
2798         if (!allcolumns[is_no]) {
2799           for (l=0; l<max2; l++,ct2++) {
2800 #if defined(PETSC_USE_CTABLE)
2801             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2802 #else
2803             tcol = cmap_i[rbuf3_i[ct2]];
2804 #endif
2805             if (tcol) {
2806               *mat_j++ = tcol - 1;
2807               *mat_a++ = rbuf4_i[ct2];
2808               ilen++;
2809             }
2810           }
2811         } else { /* allcolumns */
2812           for (l=0; l<max2; l++,ct2++) {
2813             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2814             *mat_a++ = rbuf4_i[ct2];
2815             ilen++;
2816           }
2817         }
2818         imat_ilen[row] = ilen;
2819       }
2820     }
2821   }
2822 
2823   if (!iscsorted) { /* sort column indices of the rows */
2824     for (i=0; i<ismax; i++) {
2825       subc      = (Mat_SeqAIJ*)submats[i]->data;
2826       imat_j    = subc->j;
2827       imat_i    = subc->i;
2828       imat_a    = subc->a;
2829       imat_ilen = subc->ilen;
2830 
2831       if (allcolumns[i]) continue;
2832       jmax = nrow[i];
2833       for (j=0; j<jmax; j++) {
2834         mat_i = imat_i[j];
2835         mat_a = imat_a + mat_i;
2836         mat_j = imat_j + mat_i;
2837         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2838       }
2839     }
2840   }
2841 
2842   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2843   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2844   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2845   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2846   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2847 
2848   /* Restore the indices */
2849   for (i=0; i<ismax; i++) {
2850     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2851     if (!allcolumns[i]) {
2852       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2853     }
2854   }
2855 
2856   for (i=0; i<ismax; i++) {
2857     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2858     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2859   }
2860 
2861   /* Destroy allocated memory */
2862   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2863   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2864   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2865 
2866   for (i=0; i<nrqs; ++i) {
2867     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2868   }
2869   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2870 
2871   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2872   PetscFunctionReturn(0);
2873 }
2874 
2875 /*
2876  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2877  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2879  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2880  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2881  state, and needs to be "assembled" later by compressing B's column space.
2882 
2883  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2884  Following this call, C->A & C->B have been created, even if empty.
2885  */
2886 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2887 {
2888   /* If making this function public, change the error returned in this function away from _PLIB. */
2889   PetscErrorCode ierr;
2890   Mat_MPIAIJ     *aij;
2891   Mat_SeqAIJ     *Baij;
2892   PetscBool      seqaij,Bdisassembled;
2893   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2894   PetscScalar    v;
2895   const PetscInt *rowindices,*colindices;
2896 
2897   PetscFunctionBegin;
2898   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2899   if (A) {
2900     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2901     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2902     if (rowemb) {
2903       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2904       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2905     } else {
2906       if (C->rmap->n != A->rmap->n) {
2907 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2908       }
2909     }
2910     if (dcolemb) {
2911       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2912       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2913     } else {
2914       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2915     }
2916   }
2917   if (B) {
2918     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2919     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2920     if (rowemb) {
2921       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2922       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2923     } else {
2924       if (C->rmap->n != B->rmap->n) {
2925 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2926       }
2927     }
2928     if (ocolemb) {
2929       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2930       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2931     } else {
2932       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2933     }
2934   }
2935 
2936   aij = (Mat_MPIAIJ*)C->data;
2937   if (!aij->A) {
2938     /* Mimic parts of MatMPIAIJSetPreallocation() */
2939     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2940     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2941     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2942     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2943     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2944   }
2945   if (A) {
2946     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2947   } else {
2948     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2949   }
2950   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2951     /*
2952       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2953       need to "disassemble" B -- convert it to using C's global indices.
2954       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2955 
2956       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2957       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2958 
2959       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2960       At least avoid calling MatSetValues() and the implied searches?
2961     */
2962 
2963     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2964 #if defined(PETSC_USE_CTABLE)
2965       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2966 #else
2967       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2968       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2969       if (aij->B) {
2970         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2971       }
2972 #endif
2973       ngcol = 0;
2974       if (aij->lvec) {
2975 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2976       }
2977       if (aij->garray) {
2978 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2979 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2980       }
2981       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2982       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2983     }
2984     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2985       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2986     }
2987     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2988       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2989     }
2990   }
2991   Bdisassembled = PETSC_FALSE;
2992   if (!aij->B) {
2993     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2994     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2995     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2996     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2997     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2998     Bdisassembled = PETSC_TRUE;
2999   }
3000   if (B) {
3001     Baij = (Mat_SeqAIJ*)B->data;
3002     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3003       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3004       for (i=0; i<B->rmap->n; i++) {
3005 	nz[i] = Baij->i[i+1] - Baij->i[i];
3006       }
3007       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3008       ierr = PetscFree(nz);CHKERRQ(ierr);
3009     }
3010 
3011     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3012     shift = rend-rstart;
3013     count = 0;
3014     rowindices = NULL;
3015     colindices = NULL;
3016     if (rowemb) {
3017       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3018     }
3019     if (ocolemb) {
3020       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3021     }
3022     for (i=0; i<B->rmap->n; i++) {
3023       PetscInt row;
3024       row = i;
3025       if (rowindices) row = rowindices[i];
3026       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3027 	col  = Baij->j[count];
3028 	if (colindices) col = colindices[col];
3029 	if (Bdisassembled && col>=rstart) col += shift;
3030 	v    = Baij->a[count];
3031 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3032 	++count;
3033       }
3034     }
3035     /* No assembly for aij->B is necessary. */
3036     /* FIXME: set aij->B's nonzerostate correctly. */
3037   } else {
3038     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3039   }
3040   C->preallocated  = PETSC_TRUE;
3041   C->was_assembled = PETSC_FALSE;
3042   C->assembled     = PETSC_FALSE;
3043    /*
3044       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3045       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3046    */
3047   PetscFunctionReturn(0);
3048 }
3049 
3050 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3052  */
3053 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3054 {
3055   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3056 
3057   PetscFunctionBegin;
3058   PetscValidPointer(A,2);
3059   PetscValidPointer(B,3);
3060   /* FIXME: make sure C is assembled */
3061   *A = aij->A;
3062   *B = aij->B;
3063   /* Note that we don't incref *A and *B, so be careful! */
3064   PetscFunctionReturn(0);
3065 }
3066 
3067 /*
3068   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3069   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3070 */
3071 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3072                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3073 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3074 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3075 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3076 {
3077   PetscErrorCode ierr;
3078   PetscMPIInt    isize,flag;
3079   PetscInt       i,ii,cismax,ispar;
3080   Mat            *A,*B;
3081   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3082 
3083   PetscFunctionBegin;
3084   if (!ismax) PetscFunctionReturn(0);
3085 
3086   for (i = 0, cismax = 0; i < ismax; ++i) {
3087     PetscMPIInt isize;
3088     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3089     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3090     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3091     if (isize > 1) ++cismax;
3092   }
3093 
3094   /*
3095      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3096      ispar counts the number of parallel ISs across C's comm.
3097   */
3098   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3099   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3100     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3101     PetscFunctionReturn(0);
3102   }
3103 
3104   /* if (ispar) */
3105   /*
3106     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3107     These are used to extract the off-diag portion of the resulting parallel matrix.
3108     The row IS for the off-diag portion is the same as for the diag portion,
3109     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3110   */
3111   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3112   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3113   for (i = 0, ii = 0; i < ismax; ++i) {
3114     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3115     if (isize > 1) {
3116       /*
3117 	 TODO: This is the part that's ***NOT SCALABLE***.
3118 	 To fix this we need to extract just the indices of C's nonzero columns
3119 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3120 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3121 	 to be done without serializing on the IS list, so, most likely, it is best
3122 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3123       */
3124       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3125       /* Now we have to
3126 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3127 	     were sorted on each rank, concatenated they might no longer be sorted;
3128 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3129 	     indices in the nondecreasing order to the original index positions.
3130 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3131       */
3132       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3133       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3134       ++ii;
3135     }
3136   }
3137   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3138   for (i = 0, ii = 0; i < ismax; ++i) {
3139     PetscInt       j,issize;
3140     const PetscInt *indices;
3141 
3142     /*
3143        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3144      */
3145     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3146     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3147     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3148     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3149     for (j = 1; j < issize; ++j) {
3150       if (indices[j] == indices[j-1]) {
3151 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3152       }
3153     }
3154     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3155 
3156 
3157     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3158     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3159     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3160     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3161     for (j = 1; j < issize; ++j) {
3162       if (indices[j-1] == indices[j]) {
3163 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3164       }
3165     }
3166     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3167     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3168     if (isize > 1) {
3169       cisrow[ii] = isrow[i];
3170       ++ii;
3171     }
3172   }
3173   /*
3174     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3175     array of sequential matrices underlying the resulting parallel matrices.
3176     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3177     contain duplicates.
3178 
3179     There are as many diag matrices as there are original index sets. There are only as many parallel
3180     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3181 
3182     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3183     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3184       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3185       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3186       with A[i] and B[ii] extracted from the corresponding MPI submat.
3187     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3188       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3189       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3190       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3191       values into A[i] and B[ii] sitting inside the corresponding submat.
3192     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3193       A[i], B[ii], AA[i] or BB[ii] matrices.
3194   */
3195   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3196   if (scall == MAT_INITIAL_MATRIX) {
3197     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3198   }
3199 
3200   /* Now obtain the sequential A and B submatrices separately. */
3201   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3202   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3203   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3204 
3205   /*
3206     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3207     matrices A & B have been extracted directly into the parallel matrices containing them, or
3208     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3209     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3210     to have the same sparsity pattern.
3211     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3212     must be constructed for C. This is done by setseqmat(s).
3213   */
3214   for (i = 0, ii = 0; i < ismax; ++i) {
3215     /*
3216        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3217        That way we can avoid sorting and computing permutations when reusing.
3218        To this end:
3219         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3220 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3221 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3222     */
3223     MatStructure pattern;
3224     pattern = DIFFERENT_NONZERO_PATTERN;
3225 
3226     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3227     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3228     if (isize > 1) {
3229       if (scall == MAT_INITIAL_MATRIX) {
3230 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3231 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3232 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3233 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3234 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3235       }
3236       /*
3237 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3238       */
3239       {
3240 	Mat AA,BB;
3241         AA = A[i];
3242         BB = B[ii];
3243 	if (AA || BB) {
3244 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3245 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3246 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3247 	}
3248 
3249         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3250       }
3251       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3252       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3253       ++ii;
3254     } else { /* if (isize == 1) */
3255       if (scall == MAT_REUSE_MATRIX) {
3256         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3257       }
3258       if (isrow_p[i] || iscol_p[i]) {
3259         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3260         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3261 	/* Otherwise A is extracted straight into (*submats)[i]. */
3262 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3263 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3264       } else (*submat)[i] = A[i];
3265     }
3266     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3267     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3268   }
3269   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3270   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3271   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3272   ierr = PetscFree(A);CHKERRQ(ierr);
3273   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3274   PetscFunctionReturn(0);
3275 }
3276 
3277 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3278 {
3279   PetscErrorCode ierr;
3280 
3281   PetscFunctionBegin;
3282   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3283   PetscFunctionReturn(0);
3284 }
3285