xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision bef158480efac06de457f7a665168877ab3c2fd7)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - one overlap round, memory-scalable version.

   Strategy (as implemented below):
     1. Collect, over all nidx index sets, the rows that are owned by other ranks.
     2. Ship <is-id,row> pairs to the owning ranks with a PetscSF built from a
        two-sided handshake (PetscCommBuildTwoSided).
     3. The owners expand each received row by one layer of matrix connectivity
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and send the column sets back
        through a second PetscSF.
     4. Locally owned rows are expanded in MatIncreaseOverlap_MPIAIJ_Local_Scalable
        (overlapped with the return communication), and the remote results are
        merged into the index sets in MatIncreaseOverlap_MPIAIJ_Receive_Scalable.

   The is[] array is modified in place: each IS is destroyed and recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];  /* tlength = total entries over all index sets */
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    /* no rank needs remote rows: free scratch and do the purely local expansion */
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size (2 entries per row: <is id,row>) */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* fromsizes now holds, per sender, the pair <message length, sender-side offset> */
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j; /* position in sender's todata buffer */
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second handshake: exchange the sizes of the expanded row sets going back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the returned overlap data into the (already locally expanded) index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
192 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merges received overlap data into
   the index sets.

   recvdata is a stream of nrecvs integers laid out as repeated triples
   <is id, count, count columns...> (the format produced by
   MatIncreaseOverlap_MPIAIJ_Send_Scalable).  For each IS: its current indices
   plus the received columns are gathered into a per-IS slice of a single
   scratch buffer, sorted with duplicates removed, and a new IS is created to
   replace the old one (is[] is modified in place).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  /* record current size of each IS and the largest one */
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slice of (max_lsize+nrecvs) entries: enough for its old
     indices plus, in the worst case, every received column */
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    /* keep the communicator alive across the ISDestroy below */
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
241 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - expands rows requested by remote
   ranks by one layer of connectivity and packs the result for the return trip.

   fromrows holds, per requesting rank i, fromsizes[2*i] integers arranged as
   <is id,row> pairs.  For each locally owned row we walk its A-part (diagonal
   block, columns shifted by cstart) and B-part (off-diagonal block, columns
   mapped through gcols) in the CSR arrays obtained from MatGetRowIJ().

   Two passes share the same unpacking code: the first only counts unique
   columns per <rank,IS> (indv_counts) so a single allocation suffices; the
   second fills sbdata with triples <is id, count, columns...> and sbsizes with
   <length, offset> pairs per destination rank.  Ownership of sbsizes/sbdata
   transfers to the caller via sbrowsizes/sbrows.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count unique columns per <rank,IS> to size the send buffer */
  for (i=0; i<nfrom; i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* global -> local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* pass 2: identical traversal, this time filling sbdata/sbsizes */
  for (i=0;i<nfrom;i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue; /* nothing to send for this IS */
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* compute per-rank offsets: sbsizes becomes <length,offset> pairs */
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  /* hand the packed buffers to the caller (caller frees them) */
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
364 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expands each index set by one
   layer of connectivity using only locally owned rows.

   For every index in is[i] that this rank owns, the columns of that row are
   collected from the A-part (diagonal block, columns shifted by cstart) and
   the B-part (off-diagonal block, columns mapped through gcols) CSR arrays.
   The collected columns are sorted, deduplicated, and replace is[i] in place.
   Rows owned by other ranks are skipped here; they are handled by the
   Send/Receive_Scalable pair.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* non-local rows are expanded by their owners */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   /* duplicate the communicator before destroying the IS so the new IS can use it */
   ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
   ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
432 
433 
434 /*
435   Sample message format:
436   If a processor A wants processor B to process some elements corresponding
437   to index sets is[1],is[5]
438   mesg [0] = 2   (no of index sets in the mesg)
439   -----------
440   mesg [1] = 1 => is[1]
441   mesg [2] = sizeof(is[1]);
442   -----------
443   mesg [3] = 5  => is[5]
444   mesg [4] = sizeof(is[5]);
445   -----------
446   mesg [5]
447   mesg [n]  data(is[1])
448   -----------
449   mesg[n+1]
450   mesg[m]  data(is[5])
451   -----------
452 
453   Notes:
454   nrqs - no of requests sent (or to be sent out)
455   nrqr - no of requests received (which have to be or which have been processed)
456 */
457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458 {
459   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
460   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461   const PetscInt **idx,*idx_i;
462   PetscInt       *n,**data,len;
463 #if defined(PETSC_USE_CTABLE)
464   PetscTable     *table_data,table_data_i;
465   PetscInt       *tdata,tcount,tcount_max;
466 #else
467   PetscInt       *data_i,*d_p;
468 #endif
469   PetscErrorCode ierr;
470   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
471   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473   PetscBT        *table;
474   MPI_Comm       comm;
475   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476   MPI_Status     *s_status,*recv_status;
477   MPI_Comm       *iscomms;
478   char           *t_p;
479 
480   PetscFunctionBegin;
481   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
482   size = c->size;
483   rank = c->rank;
484   M    = C->rmap->N;
485 
486   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
487   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
488 
489   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
490 
491   for (i=0; i<imax; i++) {
492     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
493     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
494   }
495 
496   /* evaluate communication - mesg to who,length of mesg, and buffer space
497      required. Based on this, buffers are allocated, and data copied into them  */
498   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
499   for (i=0; i<imax; i++) {
500     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
501     idx_i = idx[i];
502     len   = n[i];
503     for (j=0; j<len; j++) {
504       row = idx_i[j];
505       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
507       w4[proc]++;
508     }
509     for (j=0; j<size; j++) {
510       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511     }
512   }
513 
514   nrqs     = 0;              /* no of outgoing messages */
515   msz      = 0;              /* total mesg length (for all proc */
516   w1[rank] = 0;              /* no mesg sent to intself */
517   w3[rank] = 0;
518   for (i=0; i<size; i++) {
519     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520   }
521   /* pa - is list of processors to communicate with */
522   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
523   for (i=0,j=0; i<size; i++) {
524     if (w1[i]) {pa[j] = i; j++;}
525   }
526 
527   /* Each message would have a header = 1 + 2*(no of IS) + data */
528   for (i=0; i<nrqs; i++) {
529     j      = pa[i];
530     w1[j] += w2[j] + 2*w3[j];
531     msz   += w1[j];
532   }
533 
534   /* Determine the number of messages to expect, their lengths, from from-ids */
535   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
536   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
537 
538   /* Now post the Irecvs corresponding to these messages */
539   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
540 
541   /* Allocate Memory for outgoing messages */
542   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
543   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
544   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
545 
546   {
547     PetscInt *iptr = tmp,ict  = 0;
548     for (i=0; i<nrqs; i++) {
549       j         = pa[i];
550       iptr     +=  ict;
551       outdat[j] = iptr;
552       ict       = w1[j];
553     }
554   }
555 
556   /* Form the outgoing messages */
557   /* plug in the headers */
558   for (i=0; i<nrqs; i++) {
559     j            = pa[i];
560     outdat[j][0] = 0;
561     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
562     ptr[j]       = outdat[j] + 2*w3[j] + 1;
563   }
564 
565   /* Memory for doing local proc's work */
566   {
567     PetscInt M_BPB_imax = 0;
568 #if defined(PETSC_USE_CTABLE)
569     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
570     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
571     for (i=0; i<imax; i++) {
572       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
573     }
574     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
575     for (i=0; i<imax; i++) {
576       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577     }
578 #else
579     PetscInt Mimax = 0;
580     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
581     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
582     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
583     for (i=0; i<imax; i++) {
584       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585       data[i]  = d_p + M*i;
586     }
587 #endif
588   }
589 
590   /* Parse the IS and update local tables and the outgoing buf with the data */
591   {
592     PetscInt n_i,isz_i,*outdat_j,ctr_j;
593     PetscBT  table_i;
594 
595     for (i=0; i<imax; i++) {
596       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
597       n_i     = n[i];
598       table_i = table[i];
599       idx_i   = idx[i];
600 #if defined(PETSC_USE_CTABLE)
601       table_data_i = table_data[i];
602 #else
603       data_i  = data[i];
604 #endif
605       isz_i   = isz[i];
606       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
607         row  = idx_i[j];
608         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
609         if (proc != rank) { /* copy to the outgoing buffer */
610           ctr[proc]++;
611           *ptr[proc] = row;
612           ptr[proc]++;
613         } else if (!PetscBTLookupSet(table_i,row)) {
614 #if defined(PETSC_USE_CTABLE)
615           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
616 #else
617           data_i[isz_i] = row; /* Update the local table */
618 #endif
619           isz_i++;
620         }
621       }
622       /* Update the headers for the current IS */
623       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624         if ((ctr_j = ctr[j])) {
625           outdat_j        = outdat[j];
626           k               = ++outdat_j[0];
627           outdat_j[2*k]   = ctr_j;
628           outdat_j[2*k-1] = i;
629         }
630       }
631       isz[i] = isz_i;
632     }
633   }
634 
635   /*  Now  post the sends */
636   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
637   for (i=0; i<nrqs; ++i) {
638     j    = pa[i];
639     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
640   }
641 
642   /* No longer need the original indices */
643   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
644   for (i=0; i<imax; ++i) {
645     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
646     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
647   }
648   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
649 
650   for (i=0; i<imax; ++i) {
651     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
652   }
653 
654   /* Do Local work */
655 #if defined(PETSC_USE_CTABLE)
656   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
657 #else
658   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
659 #endif
660 
661   /* Receive messages */
662   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
663   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
664 
665   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
666   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
667 
668   /* Phase 1 sends are complete - deallocate buffers */
669   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
670   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
671 
672   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
673   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
674   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
675   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
676   ierr = PetscFree(rbuf);CHKERRQ(ierr);
677 
678 
679   /* Send the data back */
680   /* Do a global reduction to know the buffer space req for incoming messages */
681   {
682     PetscMPIInt *rw1;
683 
684     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
685 
686     for (i=0; i<nrqr; ++i) {
687       proc = recv_status[i].MPI_SOURCE;
688 
689       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690       rw1[proc] = isz1[i];
691     }
692     ierr = PetscFree(onodes1);CHKERRQ(ierr);
693     ierr = PetscFree(olengths1);CHKERRQ(ierr);
694 
695     /* Determine the number of messages to expect, their lengths, from from-ids */
696     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
697     ierr = PetscFree(rw1);CHKERRQ(ierr);
698   }
699   /* Now post the Irecvs corresponding to these messages */
700   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
701 
702   /* Now  post the sends */
703   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
704   for (i=0; i<nrqr; ++i) {
705     j    = recv_status[i].MPI_SOURCE;
706     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
707   }
708 
709   /* receive work done on other processors */
710   {
711     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712     PetscMPIInt idex;
713     PetscBT     table_i;
714     MPI_Status  *status2;
715 
716     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
717     for (i=0; i<nrqs; ++i) {
718       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
719       /* Process the message */
720       rbuf2_i = rbuf2[idex];
721       ct1     = 2*rbuf2_i[0]+1;
722       jmax    = rbuf2[idex][0];
723       for (j=1; j<=jmax; j++) {
724         max     = rbuf2_i[2*j];
725         is_no   = rbuf2_i[2*j-1];
726         isz_i   = isz[is_no];
727         table_i = table[is_no];
728 #if defined(PETSC_USE_CTABLE)
729         table_data_i = table_data[is_no];
730 #else
731         data_i  = data[is_no];
732 #endif
733         for (k=0; k<max; k++,ct1++) {
734           row = rbuf2_i[ct1];
735           if (!PetscBTLookupSet(table_i,row)) {
736 #if defined(PETSC_USE_CTABLE)
737             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
738 #else
739             data_i[isz_i] = row;
740 #endif
741             isz_i++;
742           }
743         }
744         isz[is_no] = isz_i;
745       }
746     }
747 
748     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
749     ierr = PetscFree(status2);CHKERRQ(ierr);
750   }
751 
752 #if defined(PETSC_USE_CTABLE)
753   tcount_max = 0;
754   for (i=0; i<imax; ++i) {
755     table_data_i = table_data[i];
756     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
757     if (tcount_max < tcount) tcount_max = tcount;
758   }
759   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
760 #endif
761 
762   for (i=0; i<imax; ++i) {
763 #if defined(PETSC_USE_CTABLE)
764     PetscTablePosition tpos;
765     table_data_i = table_data[i];
766 
767     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
768     while (tpos) {
769       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
770       tdata[--j] = --k;
771     }
772     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
773 #else
774     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
775 #endif
776     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
777   }
778 
779   ierr = PetscFree(iscomms);CHKERRQ(ierr);
780   ierr = PetscFree(onodes2);CHKERRQ(ierr);
781   ierr = PetscFree(olengths2);CHKERRQ(ierr);
782 
783   ierr = PetscFree(pa);CHKERRQ(ierr);
784   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
785   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
786   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
787   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
790   ierr = PetscFree(s_status);CHKERRQ(ierr);
791   ierr = PetscFree(recv_status);CHKERRQ(ierr);
792   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
793   ierr = PetscFree(xdata);CHKERRQ(ierr);
794   ierr = PetscFree(isz1);CHKERRQ(ierr);
795 #if defined(PETSC_USE_CTABLE)
796   for (i=0; i<imax; i++) {
797     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
798   }
799   ierr = PetscFree(table_data);CHKERRQ(ierr);
800   ierr = PetscFree(tdata);CHKERRQ(ierr);
801   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
802 #else
803   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
804 #endif
805   PetscFunctionReturn(0);
806 }
807 
808 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
810        the work on the local processor.
811 
812      Inputs:
813       C      - MAT_MPIAIJ;
814       imax - total no of index sets processed at a time;
815       table  - an array of char - size = m bits.
816 
817      Output:
818       isz    - array containing the count of the solution elements corresponding
819                to each index set;
820       data or table_data  - pointer to the solutions
821 */
822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823 {
824   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825   Mat        A  = c->A,B = c->B;
826   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
828   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
829   PetscBT    table_i;
830 #if defined(PETSC_USE_CTABLE)
831   PetscTable         table_data_i;
832   PetscErrorCode     ierr;
833   PetscTablePosition tpos;
834   PetscInt           tcount,*tdata;
835 #else
836   PetscInt           *data_i;
837 #endif
838 
839   PetscFunctionBegin;
840   rstart = C->rmap->rstart;
841   cstart = C->cmap->rstart;
842   ai     = a->i;
843   aj     = a->j;
844   bi     = b->i;
845   bj     = b->j;
846   garray = c->garray;
847 
848   for (i=0; i<imax; i++) {
849 #if defined(PETSC_USE_CTABLE)
850     /* copy existing entries of table_data_i into tdata[] */
851     table_data_i = table_data[i];
852     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
853     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
854 
855     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
856     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
857     while (tpos) {
858       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
859       tdata[--j] = --row;
860       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
861     }
862 #else
863     data_i  = data[i];
864 #endif
865     table_i = table[i];
866     isz_i   = isz[i];
867     max     = isz[i];
868 
869     for (j=0; j<max; j++) {
870 #if defined(PETSC_USE_CTABLE)
871       row   = tdata[j] - rstart;
872 #else
873       row   = data_i[j] - rstart;
874 #endif
875       start = ai[row];
876       end   = ai[row+1];
877       for (k=start; k<end; k++) { /* Amat */
878         val = aj[k] + cstart;
879         if (!PetscBTLookupSet(table_i,val)) {
880 #if defined(PETSC_USE_CTABLE)
881           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
882 #else
883           data_i[isz_i] = val;
884 #endif
885           isz_i++;
886         }
887       }
888       start = bi[row];
889       end   = bi[row+1];
890       for (k=start; k<end; k++) { /* Bmat */
891         val = garray[bj[k]];
892         if (!PetscBTLookupSet(table_i,val)) {
893 #if defined(PETSC_USE_CTABLE)
894           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
895 #else
896           data_i[isz_i] = val;
897 #endif
898           isz_i++;
899         }
900       }
901     }
902     isz[i] = isz_i;
903 
904 #if defined(PETSC_USE_CTABLE)
905     ierr = PetscFree(tdata);CHKERRQ(ierr);
906 #endif
907   }
908   PetscFunctionReturn(0);
909 }
910 
911 /*
912       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913          and return the output
914 
915          Input:
916            C    - the matrix
917            nrqr - no of messages being processed.
918            rbuf - an array of pointers to the received requests
919 
920          Output:
921            xdata - array of messages to be sent back
922            isz1  - size of each message
923 
924   For better efficiency perhaps we should malloc separately each xdata[i],
925 then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
928 
929 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;   /* diagonal and off-diagonal blocks of C */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;        /* global dimension: xtable is indexed by global indices */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;         /* compressed off-diagonal column -> global column */


  /* ct = total number of index sets across all messages; total_sz = total rows requested */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];      /* message header word 0 = number of index sets in this message */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic for the single shared reply buffer xdata[0]; grown by 1.5x whenever exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;   /* running count of all entries written into the shared buffer */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;    /* read cursor: skip header (count + 2 words per IS) */
    ct2    =  ct1;           /* write cursor into xdata[i]; header space reserved */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);   /* dedup scope is one IS */
      oct2 = ct2;            /* remember where this IS's output begins */
      kmax = rbuf_i[2*j];    /* number of rows requested for this IS */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {   /* first occurrence of this index */
          if (!(ct3 < mem_estimate)) {
            /* out of space: grow xdata[0] and rebase xdata[1..i], which alias into it */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly recorded row by its column indices (the actual overlap work) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;   /* requested rows are owned here -> local index */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {     /* diagonal block: shift local column to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {     /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: reply echoes the IS id and carries the expanded size */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;  /* reply i+1 packed immediately after reply i */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1046 /* -------------------------------------------------------------------------*/
1047 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048 /*
1049     Every processor gets the entire matrix
1050 */
/*
   Gathers all of the parallel matrix A onto every process as a sequential matrix B.
   With MAT_INITIAL_MATRIX the global nonzero structure is assembled collectively;
   numerical values are gathered only when flag == MAT_GET_VALUES.
   *Bin is allocated here as an array of length 2 with B in entry 0
   (entry 1 is left NULL; presumably reserved for the caller -- confirm at call sites).
   Collective on A's communicator: both Allgatherv paths must be entered by all ranks.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* fill only the locally owned slice of lens; the Allgatherv below completes it */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* in-place variant: each rank contributes its own slice of lens directly */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at this rank's row block so the
       in-place Allgatherv below needs no separate send buffer */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      /* emit off-diagonal columns, diagonal-block columns, then remaining
         off-diagonal columns -- keeps each row's global columns in order
         (assumes aj and garray are sorted ascending -- PETSc AIJ convention) */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are reused: now in units of nonzeros, not rows */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices: prefix sum of the row lengths */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built; recvcounts/displs stay NULL
       and are reallocated below only if values are gathered */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    /* as with the indices, write into B's own a array at this rank's block */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      /* values must follow the exact same lower/diagonal/upper order used
         for the column indices above so value k matches column k */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reached on the MAT_REUSE_MATRIX path where they were never allocated */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information so solvers on B can exploit it */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1233 
1234 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1235 {
1236   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1237   Mat            submat,A = c->A,B = c->B;
1238   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1239   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1240   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1241   const PetscInt *icol,*irow;
1242   PetscInt       nrow,ncol,start;
1243   PetscErrorCode ierr;
1244   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1245   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1246   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1247   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1248   PetscInt       *lens,rmax,ncols,*cols,Crow;
1249 #if defined(PETSC_USE_CTABLE)
1250   PetscTable     cmap,rmap;
1251   PetscInt       *cmap_loc,*rmap_loc;
1252 #else
1253   PetscInt       *cmap,*rmap;
1254 #endif
1255   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1256   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1257   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1258   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1259   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1260   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1261   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1262   MPI_Comm       comm;
1263   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1264   PetscMPIInt    *onodes1,*olengths1,idex,end;
1265   Mat_SubSppt    *smatis1;
1266   PetscBool      isrowsorted,iscolsorted;
1267 
1268   PetscFunctionBegin;
1269   PetscValidLogicalCollectiveInt(C,ismax,2);
1270   PetscValidLogicalCollectiveEnum(C,scall,5);
1271   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1272 
1273   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1274   size = c->size;
1275   rank = c->rank;
1276 
1277   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1278   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1279   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1280   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1281   if (allcolumns) {
1282     icol = NULL;
1283     ncol = C->cmap->N;
1284   } else {
1285     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1286     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1287   }
1288 
1289   if (scall == MAT_INITIAL_MATRIX) {
1290     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1291 
1292     /* Get some new tags to keep the communication clean */
1293     tag1 = ((PetscObject)C)->tag;
1294     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1295     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1296 
1297     /* evaluate communication - mesg to who, length of mesg, and buffer space
1298      required. Based on this, buffers are allocated, and data copied into them */
1299     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1300     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1301 
1302     /* w1[proc] = num of rows owned by proc -- to be requested */
1303     proc = 0;
1304     nrqs = 0; /* num of outgoing messages */
1305     for (j=0; j<nrow; j++) {
1306       row  = irow[j];
1307       if (!isrowsorted) proc = 0;
1308       while (row >= C->rmap->range[proc+1]) proc++;
1309       w1[proc]++;
1310       row2proc[j] = proc; /* map row index to proc */
1311 
1312       if (proc != rank && !w2[proc]) {
1313         w2[proc] = 1; nrqs++;
1314       }
1315     }
1316     w1[rank] = 0;  /* rows owned by self will not be requested */
1317 
1318     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1319     for (proc=0,j=0; proc<size; proc++) {
1320       if (w1[proc]) { pa[j++] = proc;}
1321     }
1322 
1323     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1324     msz = 0;              /* total mesg length (for all procs) */
1325     for (i=0; i<nrqs; i++) {
1326       proc      = pa[i];
1327       w1[proc] += 3;
1328       msz      += w1[proc];
1329     }
1330     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1331 
1332     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1333     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1334     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1335 
1336     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1337        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1338     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1339 
1340     /* Now post the Irecvs corresponding to these messages */
1341     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1342 
1343     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1344     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1345 
1346     /* Allocate Memory for outgoing messages */
1347     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1348     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1349     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1350 
1351     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1352     iptr = tmp;
1353     for (i=0; i<nrqs; i++) {
1354       proc        = pa[i];
1355       sbuf1[proc] = iptr;
1356       iptr       += w1[proc];
1357     }
1358 
1359     /* Form the outgoing messages */
1360     /* Initialize the header space */
1361     for (i=0; i<nrqs; i++) {
1362       proc      = pa[i];
1363       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1364       ptr[proc] = sbuf1[proc] + 3;
1365     }
1366 
1367     /* Parse the isrow and copy data into outbuf */
1368     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1369     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1370       proc = row2proc[j];
1371       if (proc != rank) { /* copy to the outgoing buf*/
1372         *ptr[proc] = irow[j];
1373         ctr[proc]++; ptr[proc]++;
1374       }
1375     }
1376 
1377     /* Update the headers for the current IS */
1378     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1379       if ((ctr_j = ctr[j])) {
1380         sbuf1_j        = sbuf1[j];
1381         k              = ++sbuf1_j[0];
1382         sbuf1_j[2*k]   = ctr_j;
1383         sbuf1_j[2*k-1] = 0;
1384       }
1385     }
1386 
1387     /* Now post the sends */
1388     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1389     for (i=0; i<nrqs; ++i) {
1390       proc = pa[i];
1391       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1392     }
1393 
1394     /* Post Receives to capture the buffer size */
1395     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1396     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1397 
1398     rbuf2[0] = tmp + msz;
1399     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1400 
1401     for (i=0; i<nrqs; ++i) {
1402       proc = pa[i];
1403       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1404     }
1405 
1406     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1407 
1408     /* Send to other procs the buf size they should allocate */
1409     /* Receive messages*/
1410     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1411     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1412 
1413     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1414     for (i=0; i<nrqr; ++i) {
1415       req_size[i] = 0;
1416       rbuf1_i        = rbuf1[i];
1417       start          = 2*rbuf1_i[0] + 1;
1418       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1419       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1420       sbuf2_i        = sbuf2[i];
1421       for (j=start; j<end; j++) {
1422         k            = rbuf1_i[j] - rstart;
1423         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1424         sbuf2_i[j]   = ncols;
1425         req_size[i] += ncols;
1426       }
1427       req_source1[i] = r_status1[i].MPI_SOURCE;
1428 
1429       /* form the header */
1430       sbuf2_i[0] = req_size[i];
1431       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1432 
1433       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1434     }
1435 
1436     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1437     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1438 
1439     /* rbuf2 is received, Post recv column indices a->j */
1440     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1441 
1442     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1443     for (i=0; i<nrqs; ++i) {
1444       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1445       req_source2[i] = r_status2[i].MPI_SOURCE;
1446       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1447     }
1448 
1449     /* Wait on sends1 and sends2 */
1450     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1451     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1452     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1453     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1454 
1455     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1456     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1457 
1458     /* Now allocate sending buffers for a->j, and send them off */
1459     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1460     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1461     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1462     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1463 
1464     for (i=0; i<nrqr; i++) { /* for each requested message */
1465       rbuf1_i   = rbuf1[i];
1466       sbuf_aj_i = sbuf_aj[i];
1467       ct1       = 2*rbuf1_i[0] + 1;
1468       ct2       = 0;
1469       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1470 
1471       kmax = rbuf1[i][2];
1472       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1473         row    = rbuf1_i[ct1] - rstart;
1474         nzA    = ai[row+1] - ai[row];
1475         nzB    = bi[row+1] - bi[row];
1476         ncols  = nzA + nzB;
1477         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1478 
1479         /* load the column indices for this row into cols*/
1480         cols = sbuf_aj_i + ct2;
1481 
1482         lwrite = 0;
1483         for (l=0; l<nzB; l++) {
1484           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1485         }
1486         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1487         for (l=0; l<nzB; l++) {
1488           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1489         }
1490 
1491         ct2 += ncols;
1492       }
1493       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1494     }
1495 
1496     /* create column map (cmap): global col of C -> local col of submat */
1497 #if defined(PETSC_USE_CTABLE)
1498     if (!allcolumns) {
1499       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1500       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1501       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1502         if (icol[j] >= cstart && icol[j] <cend) {
1503           cmap_loc[icol[j] - cstart] = j+1;
1504         } else { /* use PetscTable for non-local col indices */
1505           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1506         }
1507       }
1508     } else {
1509       cmap     = NULL;
1510       cmap_loc = NULL;
1511     }
1512     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1513 #else
1514     if (!allcolumns) {
1515       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1516       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1517     } else {
1518       cmap = NULL;
1519     }
1520 #endif
1521 
1522     /* Create lens for MatSeqAIJSetPreallocation() */
1523     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1524 
1525     /* Compute lens from local part of C */
1526     for (j=0; j<nrow; j++) {
1527       row  = irow[j];
1528       proc = row2proc[j];
1529       if (proc == rank) {
1530         /* diagonal part A = c->A */
1531         ncols = ai[row-rstart+1] - ai[row-rstart];
1532         cols  = aj + ai[row-rstart];
1533         if (!allcolumns) {
1534           for (k=0; k<ncols; k++) {
1535 #if defined(PETSC_USE_CTABLE)
1536             tcol = cmap_loc[cols[k]];
1537 #else
1538             tcol = cmap[cols[k]+cstart];
1539 #endif
1540             if (tcol) lens[j]++;
1541           }
1542         } else { /* allcolumns */
1543           lens[j] = ncols;
1544         }
1545 
1546         /* off-diagonal part B = c->B */
1547         ncols = bi[row-rstart+1] - bi[row-rstart];
1548         cols  = bj + bi[row-rstart];
1549         if (!allcolumns) {
1550           for (k=0; k<ncols; k++) {
1551 #if defined(PETSC_USE_CTABLE)
1552             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1553 #else
1554             tcol = cmap[bmap[cols[k]]];
1555 #endif
1556             if (tcol) lens[j]++;
1557           }
1558         } else { /* allcolumns */
1559           lens[j] += ncols;
1560         }
1561       }
1562     }
1563 
1564     /* Create row map (rmap): global row of C -> local row of submat */
1565 #if defined(PETSC_USE_CTABLE)
1566     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1567     for (j=0; j<nrow; j++) {
1568       row  = irow[j];
1569       proc = row2proc[j];
1570       if (proc == rank) { /* a local row */
1571         rmap_loc[row - rstart] = j;
1572       } else {
1573         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1574       }
1575     }
1576 #else
1577     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1578     for (j=0; j<nrow; j++) {
1579       rmap[irow[j]] = j;
1580     }
1581 #endif
1582 
1583     /* Update lens from offproc data */
1584     /* recv a->j is done */
1585     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1586     for (i=0; i<nrqs; i++) {
1587       proc    = pa[i];
1588       sbuf1_i = sbuf1[proc];
1589       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1590       ct1     = 2 + 1;
1591       ct2     = 0;
1592       rbuf2_i = rbuf2[i]; /* received length of C->j */
1593       rbuf3_i = rbuf3[i]; /* received C->j */
1594 
1595       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1596       max1   = sbuf1_i[2];
1597       for (k=0; k<max1; k++,ct1++) {
1598 #if defined(PETSC_USE_CTABLE)
1599         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1600         row--;
1601         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1602 #else
1603         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1604 #endif
1605         /* Now, store row index of submat in sbuf1_i[ct1] */
1606         sbuf1_i[ct1] = row;
1607 
1608         nnz = rbuf2_i[ct1];
1609         if (!allcolumns) {
1610           for (l=0; l<nnz; l++,ct2++) {
1611 #if defined(PETSC_USE_CTABLE)
1612             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1613               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1614             } else {
1615               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1616             }
1617 #else
1618             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1619 #endif
1620             if (tcol) lens[row]++;
1621           }
1622         } else { /* allcolumns */
1623           lens[row] += nnz;
1624         }
1625       }
1626     }
1627     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1628     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1629 
1630     /* Create the submatrices */
1631     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1632     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1633 
1634     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1635     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1636     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1637     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1638     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1639 
1640     /* create struct Mat_SubSppt and attached it to submat */
1641     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1642     subc = (Mat_SeqAIJ*)submat->data;
1643     subc->submatis1 = smatis1;
1644 
1645     smatis1->id          = 0;
1646     smatis1->nrqs        = nrqs;
1647     smatis1->nrqr        = nrqr;
1648     smatis1->rbuf1       = rbuf1;
1649     smatis1->rbuf2       = rbuf2;
1650     smatis1->rbuf3       = rbuf3;
1651     smatis1->sbuf2       = sbuf2;
1652     smatis1->req_source2 = req_source2;
1653 
1654     smatis1->sbuf1       = sbuf1;
1655     smatis1->ptr         = ptr;
1656     smatis1->tmp         = tmp;
1657     smatis1->ctr         = ctr;
1658 
1659     smatis1->pa           = pa;
1660     smatis1->req_size     = req_size;
1661     smatis1->req_source1  = req_source1;
1662 
1663     smatis1->allcolumns  = allcolumns;
1664     smatis1->singleis    = PETSC_TRUE;
1665     smatis1->row2proc    = row2proc;
1666     smatis1->rmap        = rmap;
1667     smatis1->cmap        = cmap;
1668 #if defined(PETSC_USE_CTABLE)
1669     smatis1->rmap_loc    = rmap_loc;
1670     smatis1->cmap_loc    = cmap_loc;
1671 #endif
1672 
1673     smatis1->destroy     = submat->ops->destroy;
1674     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1675     submat->factortype   = C->factortype;
1676 
1677     /* compute rmax */
1678     rmax = 0;
1679     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1680 
1681   } else { /* scall == MAT_REUSE_MATRIX */
1682     submat = submats[0];
1683     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1684 
1685     subc    = (Mat_SeqAIJ*)submat->data;
1686     rmax    = subc->rmax;
1687     smatis1 = subc->submatis1;
1688     nrqs        = smatis1->nrqs;
1689     nrqr        = smatis1->nrqr;
1690     rbuf1       = smatis1->rbuf1;
1691     rbuf2       = smatis1->rbuf2;
1692     rbuf3       = smatis1->rbuf3;
1693     req_source2 = smatis1->req_source2;
1694 
1695     sbuf1     = smatis1->sbuf1;
1696     sbuf2     = smatis1->sbuf2;
1697     ptr       = smatis1->ptr;
1698     tmp       = smatis1->tmp;
1699     ctr       = smatis1->ctr;
1700 
1701     pa         = smatis1->pa;
1702     req_size   = smatis1->req_size;
1703     req_source1 = smatis1->req_source1;
1704 
1705     allcolumns = smatis1->allcolumns;
1706     row2proc   = smatis1->row2proc;
1707     rmap       = smatis1->rmap;
1708     cmap       = smatis1->cmap;
1709 #if defined(PETSC_USE_CTABLE)
1710     rmap_loc   = smatis1->rmap_loc;
1711     cmap_loc   = smatis1->cmap_loc;
1712 #endif
1713   }
1714 
1715   /* Post recv matrix values */
1716   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1717   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1718   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1719   for (i=0; i<nrqs; ++i) {
1720     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1721     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1722   }
1723 
1724   /* Allocate sending buffers for a->a, and send them off */
1725   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1726   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1727   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1728   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1729 
1730   for (i=0; i<nrqr; i++) {
1731     rbuf1_i   = rbuf1[i];
1732     sbuf_aa_i = sbuf_aa[i];
1733     ct1       = 2*rbuf1_i[0]+1;
1734     ct2       = 0;
1735     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1736 
1737     kmax = rbuf1_i[2];
1738     for (k=0; k<kmax; k++,ct1++) {
1739       row = rbuf1_i[ct1] - rstart;
1740       nzA = ai[row+1] - ai[row];
1741       nzB = bi[row+1] - bi[row];
1742       ncols  = nzA + nzB;
1743       cworkB = bj + bi[row];
1744       vworkA = a_a + ai[row];
1745       vworkB = b_a + bi[row];
1746 
1747       /* load the column values for this row into vals*/
1748       vals = sbuf_aa_i + ct2;
1749 
1750       lwrite = 0;
1751       for (l=0; l<nzB; l++) {
1752         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1753       }
1754       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1755       for (l=0; l<nzB; l++) {
1756         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1757       }
1758 
1759       ct2 += ncols;
1760     }
1761     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1762   }
1763 
1764   /* Assemble submat */
1765   /* First assemble the local rows */
1766   for (j=0; j<nrow; j++) {
1767     row  = irow[j];
1768     proc = row2proc[j];
1769     if (proc == rank) {
1770       Crow = row - rstart;  /* local row index of C */
1771 #if defined(PETSC_USE_CTABLE)
1772       row = rmap_loc[Crow]; /* row index of submat */
1773 #else
1774       row = rmap[row];
1775 #endif
1776 
1777       if (allcolumns) {
1778         /* diagonal part A = c->A */
1779         ncols = ai[Crow+1] - ai[Crow];
1780         cols  = aj + ai[Crow];
1781         vals  = a->a + ai[Crow];
1782         i     = 0;
1783         for (k=0; k<ncols; k++) {
1784           subcols[i]   = cols[k] + cstart;
1785           subvals[i++] = vals[k];
1786         }
1787 
1788         /* off-diagonal part B = c->B */
1789         ncols = bi[Crow+1] - bi[Crow];
1790         cols  = bj + bi[Crow];
1791         vals  = b->a + bi[Crow];
1792         for (k=0; k<ncols; k++) {
1793           subcols[i]   = bmap[cols[k]];
1794           subvals[i++] = vals[k];
1795         }
1796 
1797         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1798 
1799       } else { /* !allcolumns */
1800 #if defined(PETSC_USE_CTABLE)
1801         /* diagonal part A = c->A */
1802         ncols = ai[Crow+1] - ai[Crow];
1803         cols  = aj + ai[Crow];
1804         vals  = a->a + ai[Crow];
1805         i     = 0;
1806         for (k=0; k<ncols; k++) {
1807           tcol = cmap_loc[cols[k]];
1808           if (tcol) {
1809             subcols[i]   = --tcol;
1810             subvals[i++] = vals[k];
1811           }
1812         }
1813 
1814         /* off-diagonal part B = c->B */
1815         ncols = bi[Crow+1] - bi[Crow];
1816         cols  = bj + bi[Crow];
1817         vals  = b->a + bi[Crow];
1818         for (k=0; k<ncols; k++) {
1819           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1820           if (tcol) {
1821             subcols[i]   = --tcol;
1822             subvals[i++] = vals[k];
1823           }
1824         }
1825 #else
1826         /* diagonal part A = c->A */
1827         ncols = ai[Crow+1] - ai[Crow];
1828         cols  = aj + ai[Crow];
1829         vals  = a->a + ai[Crow];
1830         i     = 0;
1831         for (k=0; k<ncols; k++) {
1832           tcol = cmap[cols[k]+cstart];
1833           if (tcol) {
1834             subcols[i]   = --tcol;
1835             subvals[i++] = vals[k];
1836           }
1837         }
1838 
1839         /* off-diagonal part B = c->B */
1840         ncols = bi[Crow+1] - bi[Crow];
1841         cols  = bj + bi[Crow];
1842         vals  = b->a + bi[Crow];
1843         for (k=0; k<ncols; k++) {
1844           tcol = cmap[bmap[cols[k]]];
1845           if (tcol) {
1846             subcols[i]   = --tcol;
1847             subvals[i++] = vals[k];
1848           }
1849         }
1850 #endif
1851         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1852       }
1853     }
1854   }
1855 
1856   /* Now assemble the off-proc rows */
1857   for (i=0; i<nrqs; i++) { /* for each requested message */
1858     /* recv values from other processes */
1859     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1860     proc    = pa[idex];
1861     sbuf1_i = sbuf1[proc];
1862     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1863     ct1     = 2 + 1;
1864     ct2     = 0; /* count of received C->j */
1865     ct3     = 0; /* count of received C->j that will be inserted into submat */
1866     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1867     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1868     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1869 
1870     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1871     max1 = sbuf1_i[2];             /* num of rows */
1872     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1873       row = sbuf1_i[ct1]; /* row index of submat */
1874       if (!allcolumns) {
1875         idex = 0;
1876         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1877           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1878           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1879 #if defined(PETSC_USE_CTABLE)
1880             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1881               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1882             } else {
1883               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1884             }
1885 #else
1886             tcol = cmap[rbuf3_i[ct2]];
1887 #endif
1888             if (tcol) {
1889               subcols[idex]   = --tcol; /* may not be sorted */
1890               subvals[idex++] = rbuf4_i[ct2];
1891 
1892               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1893                For reuse, we replace received C->j with index that should be inserted to submat */
1894               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1895             }
1896           }
1897           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1898         } else { /* scall == MAT_REUSE_MATRIX */
1899           submat = submats[0];
1900           subc   = (Mat_SeqAIJ*)submat->data;
1901 
1902           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1903           for (l=0; l<nnz; l++) {
1904             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1905             subvals[idex++] = rbuf4_i[ct2];
1906           }
1907 
1908           bj = subc->j + subc->i[row]; /* sorted column indices */
1909           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1910         }
1911       } else { /* allcolumns */
1912         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1913         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1914         ct2 += nnz;
1915       }
1916     }
1917   }
1918 
1919   /* sending a->a are done */
1920   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1921   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1922 
1923   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1925   submats[0] = submat;
1926 
1927   /* Restore the indices */
1928   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1929   if (!allcolumns) {
1930     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1931   }
1932 
1933   /* Destroy allocated memory */
1934   for (i=0; i<nrqs; ++i) {
1935     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1936   }
1937   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1938   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1939   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1940 
1941   if (scall == MAT_INITIAL_MATRIX) {
1942     ierr = PetscFree(lens);CHKERRQ(ierr);
1943     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1944     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1945   }
1946   PetscFunctionReturn(0);
1947 }
1948 
1949 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1950 {
1951   PetscErrorCode ierr;
1952   PetscInt       ncol;
1953   PetscBool      colflag,allcolumns=PETSC_FALSE;
1954 
1955   PetscFunctionBegin;
1956   /* Allocate memory to hold all the submatrices */
1957   if (scall == MAT_INITIAL_MATRIX) {
1958     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1959   }
1960 
1961   /* Check for special case: each processor gets entire matrix columns */
1962   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1963   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1964   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1965 
1966   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1967   PetscFunctionReturn(0);
1968 }
1969 
/*
   MatCreateSubMatrices_MPIAIJ - extract ismax (sequential) submatrices from the
   parallel matrix C, one per (isrow[i], iscol[i]) pair.

   Collective on C: every rank must call this with the same scall; the routine
   internally agrees (via MPIU_Allreduce) on a global number of stages so that
   all ranks execute the same sequence of collective calls.

   Input:
     C     - the MPIAIJ matrix
     ismax - number of index-set pairs on this rank
     isrow, iscol - the row/column index sets
     scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
   Output:
     submat - array of sequential submatrices (allocated here when scall ==
              MAT_INITIAL_MATRIX; extra slots hold per-stage dummy matrices)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = number of submatrices extracted per stage, sized so one stage's
     index workspace stays within a ~20M PetscInt budget per matrix column;
     a matrix with zero columns gets the full budget. Always at least 1. */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single identity IS pair covering the whole (square) matrix can be
         served by the fast all-matrix path, unless disabled by the option. */
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single reduction for both values: MPI_MAX over the negated flag is a
       logical AND (out[0] == -1 only if every rank set wantallmatrix), and
       MPI_MAX over the local stage counts gives the global stage count. */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the Mat_SubSppt context saved during the initial extraction;
       a rank with ismax == 0 stored it in a dummy matrix instead. */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  /* Extract at most nmax submatrices per stage; ranks that have exhausted
     their local index sets keep participating with max_no == 0 so the
     collective calls inside the local routine stay matched. */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2067 
2068 /* -------------------------------------------------------------------------*/
2069 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2070 {
2071   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2072   Mat            A  = c->A;
2073   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2074   const PetscInt **icol,**irow;
2075   PetscInt       *nrow,*ncol,start;
2076   PetscErrorCode ierr;
2077   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2078   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2079   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2080   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2081   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2082 #if defined(PETSC_USE_CTABLE)
2083   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2084 #else
2085   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2086 #endif
2087   const PetscInt *irow_i;
2088   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2089   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2090   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2091   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2092   MPI_Status     *r_status3,*r_status4,*s_status4;
2093   MPI_Comm       comm;
2094   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2095   PetscMPIInt    *onodes1,*olengths1,end;
2096   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2097   Mat_SubSppt    *smat_i;
2098   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2099   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2100 
2101   PetscFunctionBegin;
2102   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2103   size = c->size;
2104   rank = c->rank;
2105 
2106   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2107   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2108 
2109   for (i=0; i<ismax; i++) {
2110     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2111     if (!issorted[i]) iscsorted = issorted[i];
2112 
2113     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2114 
2115     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2116     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2117 
2118     /* Check for special case: allcolumn */
2119     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2120     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2121     if (colflag && ncol[i] == C->cmap->N) {
2122       allcolumns[i] = PETSC_TRUE;
2123       icol[i] = NULL;
2124     } else {
2125       allcolumns[i] = PETSC_FALSE;
2126       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2127     }
2128   }
2129 
2130   if (scall == MAT_REUSE_MATRIX) {
2131     /* Assumes new rows are same length as the old rows */
2132     for (i=0; i<ismax; i++) {
2133       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2134       subc = (Mat_SeqAIJ*)submats[i]->data;
2135       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2136 
2137       /* Initial matrix as if empty */
2138       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2139 
2140       smat_i   = subc->submatis1;
2141 
2142       nrqs        = smat_i->nrqs;
2143       nrqr        = smat_i->nrqr;
2144       rbuf1       = smat_i->rbuf1;
2145       rbuf2       = smat_i->rbuf2;
2146       rbuf3       = smat_i->rbuf3;
2147       req_source2 = smat_i->req_source2;
2148 
2149       sbuf1     = smat_i->sbuf1;
2150       sbuf2     = smat_i->sbuf2;
2151       ptr       = smat_i->ptr;
2152       tmp       = smat_i->tmp;
2153       ctr       = smat_i->ctr;
2154 
2155       pa          = smat_i->pa;
2156       req_size    = smat_i->req_size;
2157       req_source1 = smat_i->req_source1;
2158 
2159       allcolumns[i] = smat_i->allcolumns;
2160       row2proc[i]   = smat_i->row2proc;
2161       rmap[i]       = smat_i->rmap;
2162       cmap[i]       = smat_i->cmap;
2163     }
2164 
2165     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2166       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2167       smat_i = (Mat_SubSppt*)submats[0]->data;
2168 
2169       nrqs        = smat_i->nrqs;
2170       nrqr        = smat_i->nrqr;
2171       rbuf1       = smat_i->rbuf1;
2172       rbuf2       = smat_i->rbuf2;
2173       rbuf3       = smat_i->rbuf3;
2174       req_source2 = smat_i->req_source2;
2175 
2176       sbuf1       = smat_i->sbuf1;
2177       sbuf2       = smat_i->sbuf2;
2178       ptr         = smat_i->ptr;
2179       tmp         = smat_i->tmp;
2180       ctr         = smat_i->ctr;
2181 
2182       pa          = smat_i->pa;
2183       req_size    = smat_i->req_size;
2184       req_source1 = smat_i->req_source1;
2185 
2186       allcolumns[0] = PETSC_FALSE;
2187     }
2188   } else { /* scall == MAT_INITIAL_MATRIX */
2189     /* Get some new tags to keep the communication clean */
2190     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2191     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2192 
2193     /* evaluate communication - mesg to who, length of mesg, and buffer space
2194      required. Based on this, buffers are allocated, and data copied into them*/
2195     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2196 
2197     for (i=0; i<ismax; i++) {
2198       jmax   = nrow[i];
2199       irow_i = irow[i];
2200 
2201       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2202       row2proc[i] = row2proc_i;
2203 
2204       if (issorted[i]) proc = 0;
2205       for (j=0; j<jmax; j++) {
2206         if (!issorted[i]) proc = 0;
2207         row = irow_i[j];
2208         while (row >= C->rmap->range[proc+1]) proc++;
2209         w4[proc]++;
2210         row2proc_i[j] = proc; /* map row index to proc */
2211       }
2212       for (j=0; j<size; j++) {
2213         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2214       }
2215     }
2216 
2217     nrqs     = 0;              /* no of outgoing messages */
2218     msz      = 0;              /* total mesg length (for all procs) */
2219     w1[rank] = 0;              /* no mesg sent to self */
2220     w3[rank] = 0;
2221     for (i=0; i<size; i++) {
2222       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2223     }
2224     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2225     for (i=0,j=0; i<size; i++) {
2226       if (w1[i]) { pa[j] = i; j++; }
2227     }
2228 
2229     /* Each message would have a header = 1 + 2*(no of IS) + data */
2230     for (i=0; i<nrqs; i++) {
2231       j      = pa[i];
2232       w1[j] += w2[j] + 2* w3[j];
2233       msz   += w1[j];
2234     }
2235     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2236 
2237     /* Determine the number of messages to expect, their lengths, from from-ids */
2238     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2239     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2240 
2241     /* Now post the Irecvs corresponding to these messages */
2242     tag0 = ((PetscObject)C)->tag;
2243     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2244 
2245     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2246     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2247 
2248     /* Allocate Memory for outgoing messages */
2249     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2250     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2251     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2252 
2253     {
2254       PetscInt *iptr = tmp;
2255       k    = 0;
2256       for (i=0; i<nrqs; i++) {
2257         j        = pa[i];
2258         iptr    += k;
2259         sbuf1[j] = iptr;
2260         k        = w1[j];
2261       }
2262     }
2263 
2264     /* Form the outgoing messages. Initialize the header space */
2265     for (i=0; i<nrqs; i++) {
2266       j           = pa[i];
2267       sbuf1[j][0] = 0;
2268       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2269       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2270     }
2271 
2272     /* Parse the isrow and copy data into outbuf */
2273     for (i=0; i<ismax; i++) {
2274       row2proc_i = row2proc[i];
2275       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2276       irow_i = irow[i];
2277       jmax   = nrow[i];
2278       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2279         proc = row2proc_i[j];
2280         if (proc != rank) { /* copy to the outgoing buf*/
2281           ctr[proc]++;
2282           *ptr[proc] = irow_i[j];
2283           ptr[proc]++;
2284         }
2285       }
2286       /* Update the headers for the current IS */
2287       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2288         if ((ctr_j = ctr[j])) {
2289           sbuf1_j        = sbuf1[j];
2290           k              = ++sbuf1_j[0];
2291           sbuf1_j[2*k]   = ctr_j;
2292           sbuf1_j[2*k-1] = i;
2293         }
2294       }
2295     }
2296 
2297     /*  Now  post the sends */
2298     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2299     for (i=0; i<nrqs; ++i) {
2300       j    = pa[i];
2301       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2302     }
2303 
2304     /* Post Receives to capture the buffer size */
2305     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2306     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2307     rbuf2[0] = tmp + msz;
2308     for (i=1; i<nrqs; ++i) {
2309       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2310     }
2311     for (i=0; i<nrqs; ++i) {
2312       j    = pa[i];
2313       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2314     }
2315 
2316     /* Send to other procs the buf size they should allocate */
2317     /* Receive messages*/
2318     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2319     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2320     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2321     {
2322       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2323       PetscInt   *sbuf2_i;
2324 
2325       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2326       for (i=0; i<nrqr; ++i) {
2327         req_size[i] = 0;
2328         rbuf1_i        = rbuf1[i];
2329         start          = 2*rbuf1_i[0] + 1;
2330         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2331         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2332         sbuf2_i        = sbuf2[i];
2333         for (j=start; j<end; j++) {
2334           id              = rbuf1_i[j] - rstart;
2335           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2336           sbuf2_i[j]      = ncols;
2337           req_size[i] += ncols;
2338         }
2339         req_source1[i] = r_status1[i].MPI_SOURCE;
2340         /* form the header */
2341         sbuf2_i[0] = req_size[i];
2342         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2343 
2344         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2345       }
2346     }
2347     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2348     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2349     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2350 
2351     /* Receive messages*/
2352     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2353     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2354 
2355     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2356     for (i=0; i<nrqs; ++i) {
2357       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2358       req_source2[i] = r_status2[i].MPI_SOURCE;
2359       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2360     }
2361     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2362     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2363 
2364     /* Wait on sends1 and sends2 */
2365     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2366     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2367 
2368     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2369     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2370     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2371     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2372     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2373     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2374 
2375     /* Now allocate sending buffers for a->j, and send them off */
2376     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2377     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2378     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2379     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2380 
2381     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2382     {
2383       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2384       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2385       PetscInt cend = C->cmap->rend;
2386       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2387 
2388       for (i=0; i<nrqr; i++) {
2389         rbuf1_i   = rbuf1[i];
2390         sbuf_aj_i = sbuf_aj[i];
2391         ct1       = 2*rbuf1_i[0] + 1;
2392         ct2       = 0;
2393         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2394           kmax = rbuf1[i][2*j];
2395           for (k=0; k<kmax; k++,ct1++) {
2396             row    = rbuf1_i[ct1] - rstart;
2397             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2398             ncols  = nzA + nzB;
2399             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2400 
2401             /* load the column indices for this row into cols */
2402             cols = sbuf_aj_i + ct2;
2403 
2404             lwrite = 0;
2405             for (l=0; l<nzB; l++) {
2406               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2407             }
2408             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2409             for (l=0; l<nzB; l++) {
2410               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2411             }
2412 
2413             ct2 += ncols;
2414           }
2415         }
2416         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2417       }
2418     }
2419     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2420 
2421     /* create col map: global col of C -> local col of submatrices */
2422     {
2423       const PetscInt *icol_i;
2424 #if defined(PETSC_USE_CTABLE)
2425       for (i=0; i<ismax; i++) {
2426         if (!allcolumns[i]) {
2427           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2428 
2429           jmax   = ncol[i];
2430           icol_i = icol[i];
2431           cmap_i = cmap[i];
2432           for (j=0; j<jmax; j++) {
2433             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2434           }
2435         } else cmap[i] = NULL;
2436       }
2437 #else
2438       for (i=0; i<ismax; i++) {
2439         if (!allcolumns[i]) {
2440           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2441           jmax   = ncol[i];
2442           icol_i = icol[i];
2443           cmap_i = cmap[i];
2444           for (j=0; j<jmax; j++) {
2445             cmap_i[icol_i[j]] = j+1;
2446           }
2447         } else cmap[i] = NULL;
2448       }
2449 #endif
2450     }
2451 
2452     /* Create lens which is required for MatCreate... */
2453     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2454     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2455 
2456     if (ismax) {
2457       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2458     }
2459     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2460 
2461     /* Update lens from local data */
2462     for (i=0; i<ismax; i++) {
2463       row2proc_i = row2proc[i];
2464       jmax = nrow[i];
2465       if (!allcolumns[i]) cmap_i = cmap[i];
2466       irow_i = irow[i];
2467       lens_i = lens[i];
2468       for (j=0; j<jmax; j++) {
2469         row = irow_i[j];
2470         proc = row2proc_i[j];
2471         if (proc == rank) {
2472           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2473           if (!allcolumns[i]) {
2474             for (k=0; k<ncols; k++) {
2475 #if defined(PETSC_USE_CTABLE)
2476               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2477 #else
2478               tcol = cmap_i[cols[k]];
2479 #endif
2480               if (tcol) lens_i[j]++;
2481             }
2482           } else { /* allcolumns */
2483             lens_i[j] = ncols;
2484           }
2485           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2486         }
2487       }
2488     }
2489 
2490     /* Create row map: global row of C -> local row of submatrices */
2491 #if defined(PETSC_USE_CTABLE)
2492     for (i=0; i<ismax; i++) {
2493       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2494       irow_i = irow[i];
2495       jmax   = nrow[i];
2496       for (j=0; j<jmax; j++) {
2497       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2498       }
2499     }
2500 #else
2501     for (i=0; i<ismax; i++) {
2502       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2503       rmap_i = rmap[i];
2504       irow_i = irow[i];
2505       jmax   = nrow[i];
2506       for (j=0; j<jmax; j++) {
2507         rmap_i[irow_i[j]] = j;
2508       }
2509     }
2510 #endif
2511 
2512     /* Update lens from offproc data */
2513     {
2514       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2515 
2516       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2517       for (tmp2=0; tmp2<nrqs; tmp2++) {
2518         sbuf1_i = sbuf1[pa[tmp2]];
2519         jmax    = sbuf1_i[0];
2520         ct1     = 2*jmax+1;
2521         ct2     = 0;
2522         rbuf2_i = rbuf2[tmp2];
2523         rbuf3_i = rbuf3[tmp2];
2524         for (j=1; j<=jmax; j++) {
2525           is_no  = sbuf1_i[2*j-1];
2526           max1   = sbuf1_i[2*j];
2527           lens_i = lens[is_no];
2528           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2529           rmap_i = rmap[is_no];
2530           for (k=0; k<max1; k++,ct1++) {
2531 #if defined(PETSC_USE_CTABLE)
2532             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2533             row--;
2534             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2535 #else
2536             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2537 #endif
2538             max2 = rbuf2_i[ct1];
2539             for (l=0; l<max2; l++,ct2++) {
2540               if (!allcolumns[is_no]) {
2541 #if defined(PETSC_USE_CTABLE)
2542                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2543 #else
2544                 tcol = cmap_i[rbuf3_i[ct2]];
2545 #endif
2546                 if (tcol) lens_i[row]++;
2547               } else { /* allcolumns */
2548                 lens_i[row]++; /* lens_i[row] += max2 ? */
2549               }
2550             }
2551           }
2552         }
2553       }
2554     }
2555     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2556     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2557     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2558     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2559 
2560     /* Create the submatrices */
2561     for (i=0; i<ismax; i++) {
2562       PetscInt    rbs,cbs;
2563 
2564       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2565       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2566 
2567       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2568       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2569 
2570       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2571       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2572       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2573 
2574       /* create struct Mat_SubSppt and attached it to submat */
2575       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2576       subc = (Mat_SeqAIJ*)submats[i]->data;
2577       subc->submatis1 = smat_i;
2578 
2579       smat_i->destroy          = submats[i]->ops->destroy;
2580       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2581       submats[i]->factortype   = C->factortype;
2582 
2583       smat_i->id          = i;
2584       smat_i->nrqs        = nrqs;
2585       smat_i->nrqr        = nrqr;
2586       smat_i->rbuf1       = rbuf1;
2587       smat_i->rbuf2       = rbuf2;
2588       smat_i->rbuf3       = rbuf3;
2589       smat_i->sbuf2       = sbuf2;
2590       smat_i->req_source2 = req_source2;
2591 
2592       smat_i->sbuf1       = sbuf1;
2593       smat_i->ptr         = ptr;
2594       smat_i->tmp         = tmp;
2595       smat_i->ctr         = ctr;
2596 
2597       smat_i->pa           = pa;
2598       smat_i->req_size     = req_size;
2599       smat_i->req_source1  = req_source1;
2600 
2601       smat_i->allcolumns  = allcolumns[i];
2602       smat_i->singleis    = PETSC_FALSE;
2603       smat_i->row2proc    = row2proc[i];
2604       smat_i->rmap        = rmap[i];
2605       smat_i->cmap        = cmap[i];
2606     }
2607 
2608     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2609       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2610       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2611       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2612 
2613       /* create struct Mat_SubSppt and attached it to submat */
2614       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2615       submats[0]->data = (void*)smat_i;
2616 
2617       smat_i->destroy          = submats[0]->ops->destroy;
2618       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2619       submats[0]->factortype   = C->factortype;
2620 
2621       smat_i->id          = 0;
2622       smat_i->nrqs        = nrqs;
2623       smat_i->nrqr        = nrqr;
2624       smat_i->rbuf1       = rbuf1;
2625       smat_i->rbuf2       = rbuf2;
2626       smat_i->rbuf3       = rbuf3;
2627       smat_i->sbuf2       = sbuf2;
2628       smat_i->req_source2 = req_source2;
2629 
2630       smat_i->sbuf1       = sbuf1;
2631       smat_i->ptr         = ptr;
2632       smat_i->tmp         = tmp;
2633       smat_i->ctr         = ctr;
2634 
2635       smat_i->pa           = pa;
2636       smat_i->req_size     = req_size;
2637       smat_i->req_source1  = req_source1;
2638 
2639       smat_i->allcolumns  = PETSC_FALSE;
2640       smat_i->singleis    = PETSC_FALSE;
2641       smat_i->row2proc    = NULL;
2642       smat_i->rmap        = NULL;
2643       smat_i->cmap        = NULL;
2644     }
2645 
2646     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2647     ierr = PetscFree(lens);CHKERRQ(ierr);
2648     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2649     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2650 
2651   } /* endof scall == MAT_INITIAL_MATRIX */
2652 
2653   /* Post recv matrix values */
2654   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2655   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2656   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2657   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2658   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2659   for (i=0; i<nrqs; ++i) {
2660     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2661     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2662   }
2663 
2664   /* Allocate sending buffers for a->a, and send them off */
2665   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2666   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2667   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2668   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2669 
2670   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2671   {
2672     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2673     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2674     PetscInt    cend   = C->cmap->rend;
2675     PetscInt    *b_j   = b->j;
2676     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2677 
2678     for (i=0; i<nrqr; i++) {
2679       rbuf1_i   = rbuf1[i];
2680       sbuf_aa_i = sbuf_aa[i];
2681       ct1       = 2*rbuf1_i[0]+1;
2682       ct2       = 0;
2683       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2684         kmax = rbuf1_i[2*j];
2685         for (k=0; k<kmax; k++,ct1++) {
2686           row    = rbuf1_i[ct1] - rstart;
2687           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2688           ncols  = nzA + nzB;
2689           cworkB = b_j + b_i[row];
2690           vworkA = a_a + a_i[row];
2691           vworkB = b_a + b_i[row];
2692 
2693           /* load the column values for this row into vals*/
2694           vals = sbuf_aa_i+ct2;
2695 
2696           lwrite = 0;
2697           for (l=0; l<nzB; l++) {
2698             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2699           }
2700           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2701           for (l=0; l<nzB; l++) {
2702             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2703           }
2704 
2705           ct2 += ncols;
2706         }
2707       }
2708       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2709     }
2710   }
2711 
2712   /* Assemble the matrices */
2713   /* First assemble the local rows */
2714   for (i=0; i<ismax; i++) {
2715     row2proc_i = row2proc[i];
2716     subc      = (Mat_SeqAIJ*)submats[i]->data;
2717     imat_ilen = subc->ilen;
2718     imat_j    = subc->j;
2719     imat_i    = subc->i;
2720     imat_a    = subc->a;
2721 
2722     if (!allcolumns[i]) cmap_i = cmap[i];
2723     rmap_i = rmap[i];
2724     irow_i = irow[i];
2725     jmax   = nrow[i];
2726     for (j=0; j<jmax; j++) {
2727       row  = irow_i[j];
2728       proc = row2proc_i[j];
2729       if (proc == rank) {
2730         old_row = row;
2731 #if defined(PETSC_USE_CTABLE)
2732         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2733         row--;
2734 #else
2735         row = rmap_i[row];
2736 #endif
2737         ilen_row = imat_ilen[row];
2738         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2739         mat_i    = imat_i[row];
2740         mat_a    = imat_a + mat_i;
2741         mat_j    = imat_j + mat_i;
2742         if (!allcolumns[i]) {
2743           for (k=0; k<ncols; k++) {
2744 #if defined(PETSC_USE_CTABLE)
2745             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2746 #else
2747             tcol = cmap_i[cols[k]];
2748 #endif
2749             if (tcol) {
2750               *mat_j++ = tcol - 1;
2751               *mat_a++ = vals[k];
2752               ilen_row++;
2753             }
2754           }
2755         } else { /* allcolumns */
2756           for (k=0; k<ncols; k++) {
2757             *mat_j++ = cols[k];  /* global col index! */
2758             *mat_a++ = vals[k];
2759             ilen_row++;
2760           }
2761         }
2762         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2763 
2764         imat_ilen[row] = ilen_row;
2765       }
2766     }
2767   }
2768 
2769   /* Now assemble the off proc rows */
2770   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2771   for (tmp2=0; tmp2<nrqs; tmp2++) {
2772     sbuf1_i = sbuf1[pa[tmp2]];
2773     jmax    = sbuf1_i[0];
2774     ct1     = 2*jmax + 1;
2775     ct2     = 0;
2776     rbuf2_i = rbuf2[tmp2];
2777     rbuf3_i = rbuf3[tmp2];
2778     rbuf4_i = rbuf4[tmp2];
2779     for (j=1; j<=jmax; j++) {
2780       is_no     = sbuf1_i[2*j-1];
2781       rmap_i    = rmap[is_no];
2782       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2783       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2784       imat_ilen = subc->ilen;
2785       imat_j    = subc->j;
2786       imat_i    = subc->i;
2787       imat_a    = subc->a;
2788       max1      = sbuf1_i[2*j];
2789       for (k=0; k<max1; k++,ct1++) {
2790         row = sbuf1_i[ct1];
2791 #if defined(PETSC_USE_CTABLE)
2792         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2793         row--;
2794 #else
2795         row = rmap_i[row];
2796 #endif
2797         ilen  = imat_ilen[row];
2798         mat_i = imat_i[row];
2799         mat_a = imat_a + mat_i;
2800         mat_j = imat_j + mat_i;
2801         max2  = rbuf2_i[ct1];
2802         if (!allcolumns[is_no]) {
2803           for (l=0; l<max2; l++,ct2++) {
2804 #if defined(PETSC_USE_CTABLE)
2805             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2806 #else
2807             tcol = cmap_i[rbuf3_i[ct2]];
2808 #endif
2809             if (tcol) {
2810               *mat_j++ = tcol - 1;
2811               *mat_a++ = rbuf4_i[ct2];
2812               ilen++;
2813             }
2814           }
2815         } else { /* allcolumns */
2816           for (l=0; l<max2; l++,ct2++) {
2817             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2818             *mat_a++ = rbuf4_i[ct2];
2819             ilen++;
2820           }
2821         }
2822         imat_ilen[row] = ilen;
2823       }
2824     }
2825   }
2826 
2827   if (!iscsorted) { /* sort column indices of the rows */
2828     for (i=0; i<ismax; i++) {
2829       subc      = (Mat_SeqAIJ*)submats[i]->data;
2830       imat_j    = subc->j;
2831       imat_i    = subc->i;
2832       imat_a    = subc->a;
2833       imat_ilen = subc->ilen;
2834 
2835       if (allcolumns[i]) continue;
2836       jmax = nrow[i];
2837       for (j=0; j<jmax; j++) {
2838         mat_i = imat_i[j];
2839         mat_a = imat_a + mat_i;
2840         mat_j = imat_j + mat_i;
2841         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2842       }
2843     }
2844   }
2845 
2846   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2847   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2848   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2849   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2850   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2851 
2852   /* Restore the indices */
2853   for (i=0; i<ismax; i++) {
2854     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2855     if (!allcolumns[i]) {
2856       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2857     }
2858   }
2859 
2860   for (i=0; i<ismax; i++) {
2861     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2862     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2863   }
2864 
2865   /* Destroy allocated memory */
2866   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2867   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2868   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2869 
2870   for (i=0; i<nrqs; ++i) {
2871     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2872   }
2873   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2874 
2875   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2876   PetscFunctionReturn(0);
2877 }
2878 
2879 /*
2880  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2883  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2884  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2885  state, and needs to be "assembled" later by compressing B's column space.
2886 
2887  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2888  Following this call, C->A & C->B have been created, even if empty.
2889  */
2890 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2891 {
2892   /* If making this function public, change the error returned in this function away from _PLIB. */
2893   PetscErrorCode ierr;
2894   Mat_MPIAIJ     *aij;
2895   Mat_SeqAIJ     *Baij;
2896   PetscBool      seqaij,Bdisassembled;
2897   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2898   PetscScalar    v;
2899   const PetscInt *rowindices,*colindices;
2900 
2901   PetscFunctionBegin;
2902   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2903   if (A) {
2904     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2905     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2906     if (rowemb) {
2907       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2908       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2909     } else {
2910       if (C->rmap->n != A->rmap->n) {
2911         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2912       }
2913     }
2914     if (dcolemb) {
2915       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2916       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2917     } else {
2918       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2919     }
2920   }
2921   if (B) {
2922     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2923     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2924     if (rowemb) {
2925       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2926       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2927     } else {
2928       if (C->rmap->n != B->rmap->n) {
2929         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2930       }
2931     }
2932     if (ocolemb) {
2933       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2934       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2935     } else {
2936       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2937     }
2938   }
2939 
2940   aij = (Mat_MPIAIJ*)C->data;
2941   if (!aij->A) {
2942     /* Mimic parts of MatMPIAIJSetPreallocation() */
2943     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2944     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2945     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2946     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2947     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2948   }
2949   if (A) {
2950     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2951   } else {
2952     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2953   }
2954   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2955     /*
2956       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2957       need to "disassemble" B -- convert it to using C's global indices.
2958       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2959 
2960       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2961       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2962 
2963       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2964       At least avoid calling MatSetValues() and the implied searches?
2965     */
2966 
2967     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2968 #if defined(PETSC_USE_CTABLE)
2969       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2970 #else
2971       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2972       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2973       if (aij->B) {
2974         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2975       }
2976 #endif
2977       ngcol = 0;
2978       if (aij->lvec) {
2979         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2980       }
2981       if (aij->garray) {
2982         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2983         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2984       }
2985       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2986       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2987     }
2988     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2989       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2990     }
2991     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2992       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2993     }
2994   }
2995   Bdisassembled = PETSC_FALSE;
2996   if (!aij->B) {
2997     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2998     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2999     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3000     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3001     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3002     Bdisassembled = PETSC_TRUE;
3003   }
3004   if (B) {
3005     Baij = (Mat_SeqAIJ*)B->data;
3006     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3007       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3008       for (i=0; i<B->rmap->n; i++) {
3009         nz[i] = Baij->i[i+1] - Baij->i[i];
3010       }
3011       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3012       ierr = PetscFree(nz);CHKERRQ(ierr);
3013     }
3014 
3015     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3016     shift = rend-rstart;
3017     count = 0;
3018     rowindices = NULL;
3019     colindices = NULL;
3020     if (rowemb) {
3021       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3022     }
3023     if (ocolemb) {
3024       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3025     }
3026     for (i=0; i<B->rmap->n; i++) {
3027       PetscInt row;
3028       row = i;
3029       if (rowindices) row = rowindices[i];
3030       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3031         col  = Baij->j[count];
3032         if (colindices) col = colindices[col];
3033         if (Bdisassembled && col>=rstart) col += shift;
3034         v    = Baij->a[count];
3035         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3036         ++count;
3037       }
3038     }
3039     /* No assembly for aij->B is necessary. */
3040     /* FIXME: set aij->B's nonzerostate correctly. */
3041   } else {
3042     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3043   }
3044   C->preallocated  = PETSC_TRUE;
3045   C->was_assembled = PETSC_FALSE;
3046   C->assembled     = PETSC_FALSE;
3047    /*
3048       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3049       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3050    */
3051   PetscFunctionReturn(0);
3052 }
3053 
3054 /*
3055   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3056  */
3057 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3058 {
3059   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3060 
3061   PetscFunctionBegin;
3062   PetscValidPointer(A,2);
3063   PetscValidPointer(B,3);
3064   /* FIXME: make sure C is assembled */
3065   *A = aij->A;
3066   *B = aij->B;
3067   /* Note that we don't incref *A and *B, so be careful! */
3068   PetscFunctionReturn(0);
3069 }
3070 
3071 /*
3072   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3073   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3074 */
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    size,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  /* NOTE(review): getlocalmats is accepted for interface uniformity with sibling routines but is never called in this routine. */
  if (!ismax) PetscFunctionReturn(0);

  /* Count this rank's parallel (comm size > 1) index sets; cismax sizes the off-diag work arrays allocated below. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
     */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    if (size > 1) {
      /* The off-diag row IS is the diag row IS, aliased without an extra reference (see comment above). */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
        - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        /* AA holds our reference to the diag piece and is destroyed below;
           BB stays owned by the B array and is freed collectively by MatDestroySubMatrices() at the end. */
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  /* A's entries were destroyed or handed off to (*submat)[i] above, so only the array itself is freed here;
     B's entries are still live and are destroyed together with the array by MatDestroySubMatrices(). */
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3270 
3271 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3272 {
3273   PetscErrorCode ierr;
3274 
3275   PetscFunctionBegin;
3276   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3277   PetscFunctionReturn(0);
3278 }
3279