xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 6e5f5dad97d5d0d453d1db100f9dbfcf5747de2b)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put all together so that we
69    * can optimize communication
70    *  */
71   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++) {
82     length_i  = length[i];
83     indices_i = indices[i];
84     for (j=0; j<length_i; j++) {
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank) {
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages
99    * generally speaking, we do not need to exchange
100    * data when overlap is 1
101    * */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages
104    * It usually corresponds to overlap 1
105    * */
106   if (!reducednrrows) {
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communcation */
114   for (i=0; i<size; i++) {
115     if (tosizes_temp[i]) {
116       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117       tosizes_temp[i] = nto; /* a map from processor to index */
118       toranks[nto++]  = i; /* processor */
119     }
120   }
121   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   for (i=0; i<nto; i++) {
123     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125   }
126   /* send information to other processors */
127   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
128   nrecvrows = 0;
129   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
131   nrecvrows = 0;
132   for (i=0; i<nfrom; i++) {
133     for (j=0; j<fromsizes[2*i]; j++) {
134       remote[nrecvrows].rank    = fromranks[i];
135       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136     }
137   }
138   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
139   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
140   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
141   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
142   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
143   /* message pair <no of is, row>  */
144   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
145   for (i=0; i<nrrows; i++) {
146     owner = rrow_ranks[i]; /* processor */
147     j     = tosizes_temp[owner]; /* index */
148     todata[toffsets[j]++] = rrow_isids[i];
149     todata[toffsets[j]++] = remoterows[i];
150   }
151   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
152   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
153   ierr = PetscFree(toffsets);CHKERRQ(ierr);
154   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
155   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
157   /* send rows belonging to the remote so that then we could get the overlapping data back */
158   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
159   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
160   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
161   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
162   ierr = PetscFree(fromranks);CHKERRQ(ierr);
163   nrecvrows = 0;
164   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
166   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
167   nrecvrows = 0;
168   for (i=0; i<nto; i++) {
169     for (j=0; j<tosizes[2*i]; j++) {
170       remote[nrecvrows].rank    = toranks[i];
171       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172     }
173   }
174   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
175   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
176   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
177   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
178   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
179   /* overlap communication and computation */
180   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
181   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
182   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
183   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
184   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
185   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
186   ierr = PetscFree(toranks);CHKERRQ(ierr);
187   ierr = PetscFree(tosizes);CHKERRQ(ierr);
188   ierr = PetscFree(todata);CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   MPI_Comm         *iscomms;
198   PetscErrorCode    ierr;
199 
200   PetscFunctionBegin;
201   max_lsize = 0;
202   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
203   for (i=0; i<nidx; i++){
204     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
205     max_lsize = lsize>max_lsize ? lsize:max_lsize;
206     isz[i]    = lsize;
207   }
208   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
209   for (i=0; i<nidx; i++){
210     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
211     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
213     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
214     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
215   }
216   /* retrieve information to get row id and its overlap */
217   for (i=0; i<nrecvs; ){
218     is_id     = recvdata[i++];
219     data_size = recvdata[i++];
220     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
221     isz_i     = isz[is_id];
222     for (j=0; j< data_size; j++){
223       col = recvdata[i++];
224       indices_i[isz_i++] = col;
225     }
226     isz[is_id] = isz_i;
227   }
228   /* remove duplicate entities */
229   for (i=0; i<nidx; i++){
230     indices_i = indices_temp+(max_lsize+nrecvs)*i;
231     isz_i     = isz[i];
232     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
233     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
234     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
235   }
236   ierr = PetscFree(isz);CHKERRQ(ierr);
237   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
241 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
242 {
243   PetscLayout       rmap,cmap;
244   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
245   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
246   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
247   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
248   Mat               amat,bmat;
249   PetscMPIInt       rank;
250   PetscBool         done;
251   MPI_Comm          comm;
252   PetscErrorCode    ierr;
253 
254   PetscFunctionBegin;
255   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
256   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
257   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
258   /* Even if the mat is symmetric, we still assume it is not symmetric */
259   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
260   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
261   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
262   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
263   /* total number of nonzero values is used to estimate the memory usage in the next step */
264   tnz  = ai[an]+bi[bn];
265   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
267   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
268   /* to find the longest message */
269   max_fszs = 0;
270   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
271   /* better way to estimate number of nonzero in the mat??? */
272   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
273   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
274   rows_pos  = 0;
275   totalrows = 0;
276   for (i=0; i<nfrom; i++){
277     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
278     /* group data together */
279     for (j=0; j<fromsizes[2*i]; j+=2){
280       is_id                       = fromrows[rows_pos++];/* no of is */
281       rows_i                      = rows_data[is_id];
282       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
283     }
284     /* estimate a space to avoid multiple allocations  */
285     for (j=0; j<nidx; j++){
286       indvc_ij = 0;
287       rows_i   = rows_data[j];
288       for (l=0; l<rows_pos_i[j]; l++){
289         row    = rows_i[l]-rstart;
290         start  = ai[row];
291         end    = ai[row+1];
292         for (k=start; k<end; k++){ /* Amat */
293           col = aj[k] + cstart;
294           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
295         }
296         start = bi[row];
297         end   = bi[row+1];
298         for (k=start; k<end; k++) { /* Bmat */
299           col = gcols[bj[k]];
300           indices_tmp[indvc_ij++] = col;
301         }
302       }
303       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
304       indv_counts[i*nidx+j] = indvc_ij;
305       totalrows            += indvc_ij;
306     }
307   }
308   /* message triple <no of is, number of rows, rows> */
309   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
310   totalrows = 0;
311   rows_pos  = 0;
312   /* use this code again */
313   for (i=0;i<nfrom;i++){
314     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
315     for (j=0; j<fromsizes[2*i]; j+=2){
316       is_id                       = fromrows[rows_pos++];
317       rows_i                      = rows_data[is_id];
318       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
319     }
320     /* add data  */
321     for (j=0; j<nidx; j++){
322       if (!indv_counts[i*nidx+j]) continue;
323       indvc_ij = 0;
324       sbdata[totalrows++] = j;
325       sbdata[totalrows++] = indv_counts[i*nidx+j];
326       sbsizes[2*i]       += 2;
327       rows_i              = rows_data[j];
328       for (l=0; l<rows_pos_i[j]; l++){
329         row   = rows_i[l]-rstart;
330         start = ai[row];
331         end   = ai[row+1];
332         for (k=start; k<end; k++){ /* Amat */
333           col = aj[k] + cstart;
334           indices_tmp[indvc_ij++] = col;
335         }
336         start = bi[row];
337         end   = bi[row+1];
338         for (k=start; k<end; k++) { /* Bmat */
339           col = gcols[bj[k]];
340           indices_tmp[indvc_ij++] = col;
341         }
342       }
343       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
344       sbsizes[2*i]  += indvc_ij;
345       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
346       totalrows += indvc_ij;
347     }
348   }
349   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
350   for (i=0; i<nfrom; i++){
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp,owner;
368   PetscMPIInt       rank;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
383   /* is it a safe way to compute number of nonzero values ? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* it is a better way to estimate memory than the old implementation
389    * where global size of matrix is used
390    * */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 
432 /*
433   Sample message format:
434   If a processor A wants processor B to process some elements corresponding
435   to index sets is[1],is[5]
436   mesg [0] = 2   (no of index sets in the mesg)
437   -----------
438   mesg [1] = 1 => is[1]
439   mesg [2] = sizeof(is[1]);
440   -----------
441   mesg [3] = 5  => is[5]
442   mesg [4] = sizeof(is[5]);
443   -----------
444   mesg [5]
445   mesg [n]  datas[1]
446   -----------
447   mesg[n+1]
448   mesg[m]  data(is[5])
449   -----------
450 
451   Notes:
452   nrqs - no of requests sent (or to be sent out)
453   nrqr - no of requests recieved (which have to be or which have been processed
454 */
455 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
456 {
457   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
458   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
459   const PetscInt **idx,*idx_i;
460   PetscInt       *n,**data,len;
461 #if defined(PETSC_USE_CTABLE)
462   PetscTable     *table_data,table_data_i;
463   PetscInt       *tdata,tcount,tcount_max;
464 #else
465   PetscInt       *data_i,*d_p;
466 #endif
467   PetscErrorCode ierr;
468   PetscMPIInt    size,rank,tag1,tag2;
469   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
470   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
471   PetscBT        *table;
472   MPI_Comm       comm;
473   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
474   MPI_Status     *s_status,*recv_status;
475   MPI_Comm       *iscomms;
476   char           *t_p;
477 
478   PetscFunctionBegin;
479   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
480   size = c->size;
481   rank = c->rank;
482   M    = C->rmap->N;
483 
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
485   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
486 
487   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
488 
489   for (i=0; i<imax; i++) {
490     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
491     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
492   }
493 
494   /* evaluate communication - mesg to who,length of mesg, and buffer space
495      required. Based on this, buffers are allocated, and data copied into them  */
496   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
497   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
498   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
499   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
500   for (i=0; i<imax; i++) {
501     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
502     idx_i = idx[i];
503     len   = n[i];
504     for (j=0; j<len; j++) {
505       row = idx_i[j];
506       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
507       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
508       w4[proc]++;
509     }
510     for (j=0; j<size; j++) {
511       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
512     }
513   }
514 
515   nrqs     = 0;              /* no of outgoing messages */
516   msz      = 0;              /* total mesg length (for all proc */
517   w1[rank] = 0;              /* no mesg sent to intself */
518   w3[rank] = 0;
519   for (i=0; i<size; i++) {
520     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
521   }
522   /* pa - is list of processors to communicate with */
523   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
524   for (i=0,j=0; i<size; i++) {
525     if (w1[i]) {pa[j] = i; j++;}
526   }
527 
528   /* Each message would have a header = 1 + 2*(no of IS) + data */
529   for (i=0; i<nrqs; i++) {
530     j      = pa[i];
531     w1[j] += w2[j] + 2*w3[j];
532     msz   += w1[j];
533   }
534 
535   /* Determine the number of messages to expect, their lengths, from from-ids */
536   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
537   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
538 
539   /* Now post the Irecvs corresponding to these messages */
540   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
541 
542   /* Allocate Memory for outgoing messages */
543   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
544   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
545   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
546 
547   {
548     PetscInt *iptr = tmp,ict  = 0;
549     for (i=0; i<nrqs; i++) {
550       j         = pa[i];
551       iptr     +=  ict;
552       outdat[j] = iptr;
553       ict       = w1[j];
554     }
555   }
556 
557   /* Form the outgoing messages */
558   /* plug in the headers */
559   for (i=0; i<nrqs; i++) {
560     j            = pa[i];
561     outdat[j][0] = 0;
562     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
563     ptr[j]       = outdat[j] + 2*w3[j] + 1;
564   }
565 
566   /* Memory for doing local proc's work */
567   {
568     PetscInt M_BPB_imax = 0;
569 #if defined(PETSC_USE_CTABLE)
570     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
571     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
574     }
575     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578     }
579 #else
580     PetscInt Mimax = 0;
581     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
582     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
583     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
584     for (i=0; i<imax; i++) {
585       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
586       data[i]  = d_p + M*i;
587     }
588 #endif
589   }
590 
591   /* Parse the IS and update local tables and the outgoing buf with the data */
592   {
593     PetscInt n_i,isz_i,*outdat_j,ctr_j;
594     PetscBT  table_i;
595 
596     for (i=0; i<imax; i++) {
597       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
598       n_i     = n[i];
599       table_i = table[i];
600       idx_i   = idx[i];
601 #if defined(PETSC_USE_CTABLE)
602       table_data_i = table_data[i];
603 #else
604       data_i  = data[i];
605 #endif
606       isz_i   = isz[i];
607       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
608         row  = idx_i[j];
609         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
610         if (proc != rank) { /* copy to the outgoing buffer */
611           ctr[proc]++;
612           *ptr[proc] = row;
613           ptr[proc]++;
614         } else if (!PetscBTLookupSet(table_i,row)) {
615 #if defined(PETSC_USE_CTABLE)
616           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
617 #else
618           data_i[isz_i] = row; /* Update the local table */
619 #endif
620           isz_i++;
621         }
622       }
623       /* Update the headers for the current IS */
624       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
625         if ((ctr_j = ctr[j])) {
626           outdat_j        = outdat[j];
627           k               = ++outdat_j[0];
628           outdat_j[2*k]   = ctr_j;
629           outdat_j[2*k-1] = i;
630         }
631       }
632       isz[i] = isz_i;
633     }
634   }
635 
636   /*  Now  post the sends */
637   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
638   for (i=0; i<nrqs; ++i) {
639     j    = pa[i];
640     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
641   }
642 
643   /* No longer need the original indices */
644   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
645   for (i=0; i<imax; ++i) {
646     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
647     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
648   }
649   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
650 
651   for (i=0; i<imax; ++i) {
652     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
653   }
654 
655   /* Do Local work */
656 #if defined(PETSC_USE_CTABLE)
657   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
658 #else
659   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
660 #endif
661 
662   /* Receive messages */
663   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
664   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
665 
666   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
667   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
668 
669   /* Phase 1 sends are complete - deallocate buffers */
670   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
671   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
672 
673   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
674   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
675   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
676   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
677   ierr = PetscFree(rbuf);CHKERRQ(ierr);
678 
679 
680   /* Send the data back */
681   /* Do a global reduction to know the buffer space req for incoming messages */
682   {
683     PetscMPIInt *rw1;
684 
685     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
686 
687     for (i=0; i<nrqr; ++i) {
688       proc = recv_status[i].MPI_SOURCE;
689 
690       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
691       rw1[proc] = isz1[i];
692     }
693     ierr = PetscFree(onodes1);CHKERRQ(ierr);
694     ierr = PetscFree(olengths1);CHKERRQ(ierr);
695 
696     /* Determine the number of messages to expect, their lengths, from from-ids */
697     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
698     ierr = PetscFree(rw1);CHKERRQ(ierr);
699   }
700   /* Now post the Irecvs corresponding to these messages */
701   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
702 
703   /* Now  post the sends */
704   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
705   for (i=0; i<nrqr; ++i) {
706     j    = recv_status[i].MPI_SOURCE;
707     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
708   }
709 
710   /* receive work done on other processors */
711   {
712     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
713     PetscMPIInt idex;
714     PetscBT     table_i;
715     MPI_Status  *status2;
716 
717     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
718     for (i=0; i<nrqs; ++i) {
719       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
720       /* Process the message */
721       rbuf2_i = rbuf2[idex];
722       ct1     = 2*rbuf2_i[0]+1;
723       jmax    = rbuf2[idex][0];
724       for (j=1; j<=jmax; j++) {
725         max     = rbuf2_i[2*j];
726         is_no   = rbuf2_i[2*j-1];
727         isz_i   = isz[is_no];
728         table_i = table[is_no];
729 #if defined(PETSC_USE_CTABLE)
730         table_data_i = table_data[is_no];
731 #else
732         data_i  = data[is_no];
733 #endif
734         for (k=0; k<max; k++,ct1++) {
735           row = rbuf2_i[ct1];
736           if (!PetscBTLookupSet(table_i,row)) {
737 #if defined(PETSC_USE_CTABLE)
738             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
739 #else
740             data_i[isz_i] = row;
741 #endif
742             isz_i++;
743           }
744         }
745         isz[is_no] = isz_i;
746       }
747     }
748 
749     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
750     ierr = PetscFree(status2);CHKERRQ(ierr);
751   }
752 
753 #if defined(PETSC_USE_CTABLE)
754   tcount_max = 0;
755   for (i=0; i<imax; ++i) {
756     table_data_i = table_data[i];
757     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
758     if (tcount_max < tcount) tcount_max = tcount;
759   }
760   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
761 #endif
762 
763   for (i=0; i<imax; ++i) {
764 #if defined(PETSC_USE_CTABLE)
765     PetscTablePosition tpos;
766     table_data_i = table_data[i];
767 
768     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
769     while (tpos) {
770       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
771       tdata[--j] = --k;
772     }
773     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
774 #else
775     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
776 #endif
777     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
778   }
779 
780   ierr = PetscFree(iscomms);CHKERRQ(ierr);
781   ierr = PetscFree(onodes2);CHKERRQ(ierr);
782   ierr = PetscFree(olengths2);CHKERRQ(ierr);
783 
784   ierr = PetscFree(pa);CHKERRQ(ierr);
785   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
786   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
787   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
788   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
789   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
790   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
791   ierr = PetscFree(s_status);CHKERRQ(ierr);
792   ierr = PetscFree(recv_status);CHKERRQ(ierr);
793   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
794   ierr = PetscFree(xdata);CHKERRQ(ierr);
795   ierr = PetscFree(isz1);CHKERRQ(ierr);
796 #if defined(PETSC_USE_CTABLE)
797   for (i=0; i<imax; i++) {
798     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
799   }
800   ierr = PetscFree(table_data);CHKERRQ(ierr);
801   ierr = PetscFree(tdata);CHKERRQ(ierr);
802   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
803 #else
804   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
805 #endif
806   PetscFunctionReturn(0);
807 }
808 
809 /*
810    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
811        the work on the local processor.
812 
813      Inputs:
814       C      - MAT_MPIAIJ;
815       imax - total no of index sets processed at a time;
816       table  - an array of char - size = m bits.
817 
818      Output:
819       isz    - array containing the count of the solution elements corresponding
820                to each index set;
821       data or table_data  - pointer to the solutions
822 */
823 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
824 {
825   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
826   Mat        A  = c->A,B = c->B;
827   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
828   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
829   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
830   PetscBT    table_i;
831 #if defined(PETSC_USE_CTABLE)
832   PetscTable         table_data_i;
833   PetscErrorCode     ierr;
834   PetscTablePosition tpos;
835   PetscInt           tcount,*tdata;
836 #else
837   PetscInt           *data_i;
838 #endif
839 
840   PetscFunctionBegin;
841   rstart = C->rmap->rstart;
842   cstart = C->cmap->rstart;
843   ai     = a->i;
844   aj     = a->j;
845   bi     = b->i;
846   bj     = b->j;
847   garray = c->garray;
848 
849   for (i=0; i<imax; i++) {
850 #if defined(PETSC_USE_CTABLE)
851     /* copy existing entries of table_data_i into tdata[] */
852     table_data_i = table_data[i];
853     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
854     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
855 
856     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
857     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
858     while (tpos) {
859       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
860       tdata[--j] = --row;
861       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
862     }
863 #else
864     data_i  = data[i];
865 #endif
866     table_i = table[i];
867     isz_i   = isz[i];
868     max     = isz[i];
869 
870     for (j=0; j<max; j++) {
871 #if defined(PETSC_USE_CTABLE)
872       row   = tdata[j] - rstart;
873 #else
874       row   = data_i[j] - rstart;
875 #endif
876       start = ai[row];
877       end   = ai[row+1];
878       for (k=start; k<end; k++) { /* Amat */
879         val = aj[k] + cstart;
880         if (!PetscBTLookupSet(table_i,val)) {
881 #if defined(PETSC_USE_CTABLE)
882           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
883 #else
884           data_i[isz_i] = val;
885 #endif
886           isz_i++;
887         }
888       }
889       start = bi[row];
890       end   = bi[row+1];
891       for (k=start; k<end; k++) { /* Bmat */
892         val = garray[bj[k]];
893         if (!PetscBTLookupSet(table_i,val)) {
894 #if defined(PETSC_USE_CTABLE)
895           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
896 #else
897           data_i[isz_i] = val;
898 #endif
899           isz_i++;
900         }
901       }
902     }
903     isz[i] = isz_i;
904 
905 #if defined(PETSC_USE_CTABLE)
906     ierr = PetscFree(tdata);CHKERRQ(ierr);
907 #endif
908   }
909   PetscFunctionReturn(0);
910 }
911 
912 /*
913       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
914          and return the output
915 
916          Input:
917            C    - the matrix
918            nrqr - no of messages being processed.
919            rbuf - an array of pointers to the recieved requests
920 
921          Output:
922            xdata - array of messages to be sent back
923            isz1  - size of each message
924 
925   For better efficiency perhaps we should malloc separately each xdata[i],
926 then if a remalloc is required we need only copy the data for that one row
927 rather then all previous rows as it is now where a single large chunck of
928 memory is used.
929 
930 */
931 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
932 {
933   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
934   Mat            A  = c->A,B = c->B;
935   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
936   PetscErrorCode ierr;
937   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
938   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
939   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
940   PetscInt       *rbuf_i,kmax,rbuf_0;
941   PetscBT        xtable;
942 
943   PetscFunctionBegin;
944   m      = C->rmap->N;
945   rstart = C->rmap->rstart;
946   cstart = C->cmap->rstart;
947   ai     = a->i;
948   aj     = a->j;
949   bi     = b->i;
950   bj     = b->j;
951   garray = c->garray;
952 
953 
954   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
955     rbuf_i =  rbuf[i];
956     rbuf_0 =  rbuf_i[0];
957     ct    += rbuf_0;
958     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
959   }
960 
961   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
962   else max1 = 1;
963   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
964   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
965   ++no_malloc;
966   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
967   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
968 
969   ct3 = 0;
970   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
971     rbuf_i =  rbuf[i];
972     rbuf_0 =  rbuf_i[0];
973     ct1    =  2*rbuf_0+1;
974     ct2    =  ct1;
975     ct3   += ct1;
976     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
977       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
978       oct2 = ct2;
979       kmax = rbuf_i[2*j];
980       for (k=0; k<kmax; k++,ct1++) {
981         row = rbuf_i[ct1];
982         if (!PetscBTLookupSet(xtable,row)) {
983           if (!(ct3 < mem_estimate)) {
984             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
985             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
986             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
987             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
988             xdata[0]     = tmp;
989             mem_estimate = new_estimate; ++no_malloc;
990             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
991           }
992           xdata[i][ct2++] = row;
993           ct3++;
994         }
995       }
996       for (k=oct2,max2=ct2; k<max2; k++) {
997         row   = xdata[i][k] - rstart;
998         start = ai[row];
999         end   = ai[row+1];
1000         for (l=start; l<end; l++) {
1001           val = aj[l] + cstart;
1002           if (!PetscBTLookupSet(xtable,val)) {
1003             if (!(ct3 < mem_estimate)) {
1004               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1005               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1006               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1007               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1008               xdata[0]     = tmp;
1009               mem_estimate = new_estimate; ++no_malloc;
1010               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1011             }
1012             xdata[i][ct2++] = val;
1013             ct3++;
1014           }
1015         }
1016         start = bi[row];
1017         end   = bi[row+1];
1018         for (l=start; l<end; l++) {
1019           val = garray[bj[l]];
1020           if (!PetscBTLookupSet(xtable,val)) {
1021             if (!(ct3 < mem_estimate)) {
1022               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1023               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1024               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1025               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1026               xdata[0]     = tmp;
1027               mem_estimate = new_estimate; ++no_malloc;
1028               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1029             }
1030             xdata[i][ct2++] = val;
1031             ct3++;
1032           }
1033         }
1034       }
1035       /* Update the header*/
1036       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1037       xdata[i][2*j-1] = rbuf_i[2*j-1];
1038     }
1039     xdata[i][0] = rbuf_0;
1040     xdata[i+1]  = xdata[i] + ct2;
1041     isz1[i]     = ct2; /* size of each message */
1042   }
1043   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1044   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1045   PetscFunctionReturn(0);
1046 }
1047 /* -------------------------------------------------------------------------*/
1048 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1049 /*
1050     Every processor gets the entire matrix
1051 */
1052 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1053 {
1054   Mat            B;
1055   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1056   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1057   PetscErrorCode ierr;
1058   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1059   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1060   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1061   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1062 
1063   PetscFunctionBegin;
1064   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1065   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1066 
1067   if (scall == MAT_INITIAL_MATRIX) {
1068     /* ----------------------------------------------------------------
1069          Tell every processor the number of nonzeros per row
1070     */
1071     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1072     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1073       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1074     }
1075     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1076     for (i=0; i<size; i++) {
1077       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1078       displs[i]     = A->rmap->range[i];
1079     }
1080 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1081     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1082 #else
1083     sendcount = A->rmap->rend - A->rmap->rstart;
1084     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1085 #endif
1086     /* ---------------------------------------------------------------
1087          Create the sequential matrix of the same type as the local block diagonal
1088     */
1089     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1090     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1091     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1092     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1093     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1094     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1095     **Bin = B;
1096     b     = (Mat_SeqAIJ*)B->data;
1097 
1098     /*--------------------------------------------------------------------
1099        Copy my part of matrix column indices over
1100     */
1101     sendcount  = ad->nz + bd->nz;
1102     jsendbuf   = b->j + b->i[rstarts[rank]];
1103     a_jsendbuf = ad->j;
1104     b_jsendbuf = bd->j;
1105     n          = A->rmap->rend - A->rmap->rstart;
1106     cnt        = 0;
1107     for (i=0; i<n; i++) {
1108 
1109       /* put in lower diagonal portion */
1110       m = bd->i[i+1] - bd->i[i];
1111       while (m > 0) {
1112         /* is it above diagonal (in bd (compressed) numbering) */
1113         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115         m--;
1116       }
1117 
1118       /* put in diagonal portion */
1119       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1120         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1121       }
1122 
1123       /* put in upper diagonal portion */
1124       while (m-- > 0) {
1125         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1126       }
1127     }
1128     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1129 
1130     /*--------------------------------------------------------------------
1131        Gather all column indices to all processors
1132     */
1133     for (i=0; i<size; i++) {
1134       recvcounts[i] = 0;
1135       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1136         recvcounts[i] += lens[j];
1137       }
1138     }
1139     displs[0] = 0;
1140     for (i=1; i<size; i++) {
1141       displs[i] = displs[i-1] + recvcounts[i-1];
1142     }
1143 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1144     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1145 #else
1146     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1147 #endif
1148     /*--------------------------------------------------------------------
1149         Assemble the matrix into useable form (note numerical values not yet set)
1150     */
1151     /* set the b->ilen (length of each row) values */
1152     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1153     /* set the b->i indices */
1154     b->i[0] = 0;
1155     for (i=1; i<=A->rmap->N; i++) {
1156       b->i[i] = b->i[i-1] + lens[i-1];
1157     }
1158     ierr = PetscFree(lens);CHKERRQ(ierr);
1159     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1160     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1161 
1162   } else {
1163     B = **Bin;
1164     b = (Mat_SeqAIJ*)B->data;
1165   }
1166 
1167   /*--------------------------------------------------------------------
1168        Copy my part of matrix numerical values into the values location
1169   */
1170   if (flag == MAT_GET_VALUES) {
1171     sendcount = ad->nz + bd->nz;
1172     sendbuf   = b->a + b->i[rstarts[rank]];
1173     a_sendbuf = ad->a;
1174     b_sendbuf = bd->a;
1175     b_sendj   = bd->j;
1176     n         = A->rmap->rend - A->rmap->rstart;
1177     cnt       = 0;
1178     for (i=0; i<n; i++) {
1179 
1180       /* put in lower diagonal portion */
1181       m = bd->i[i+1] - bd->i[i];
1182       while (m > 0) {
1183         /* is it above diagonal (in bd (compressed) numbering) */
1184         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185         sendbuf[cnt++] = *b_sendbuf++;
1186         m--;
1187         b_sendj++;
1188       }
1189 
1190       /* put in diagonal portion */
1191       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192         sendbuf[cnt++] = *a_sendbuf++;
1193       }
1194 
1195       /* put in upper diagonal portion */
1196       while (m-- > 0) {
1197         sendbuf[cnt++] = *b_sendbuf++;
1198         b_sendj++;
1199       }
1200     }
1201     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1202 
1203     /* -----------------------------------------------------------------
1204        Gather all numerical values to all processors
1205     */
1206     if (!recvcounts) {
1207       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1208     }
1209     for (i=0; i<size; i++) {
1210       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211     }
1212     displs[0] = 0;
1213     for (i=1; i<size; i++) {
1214       displs[i] = displs[i-1] + recvcounts[i-1];
1215     }
1216     recvbuf = b->a;
1217 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1219 #else
1220     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1221 #endif
1222   }  /* endof (flag == MAT_GET_VALUES) */
1223   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1224 
1225   if (A->symmetric) {
1226     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1227   } else if (A->hermitian) {
1228     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1229   } else if (A->structurally_symmetric) {
1230     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1231   }
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1236 {
1237   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1238   Mat            submat,A = c->A,B = c->B;
1239   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1240   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1241   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1242   const PetscInt *icol,*irow;
1243   PetscInt       nrow,ncol,start;
1244   PetscErrorCode ierr;
1245   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1246   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1247   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1248   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1249   PetscInt       *lens,rmax,ncols,*cols,Crow;
1250 #if defined(PETSC_USE_CTABLE)
1251   PetscTable     cmap,rmap;
1252   PetscInt       *cmap_loc,*rmap_loc;
1253 #else
1254   PetscInt       *cmap,*rmap;
1255 #endif
1256   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1257   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1258   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1259   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1260   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1261   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1262   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1263   MPI_Comm       comm;
1264   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1265   PetscMPIInt    *onodes1,*olengths1,idex,end;
1266   Mat_SubSppt    *smatis1;
1267   PetscBool      isrowsorted,iscolsorted;
1268 
1269   PetscFunctionBegin;
1270   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1271 
1272   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1273   size = c->size;
1274   rank = c->rank;
1275 
1276   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1277   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1278   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1279   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1280   if (allcolumns) {
1281     icol = NULL;
1282     ncol = C->cmap->N;
1283   } else {
1284     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1285     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1286   }
1287 
1288   if (scall == MAT_INITIAL_MATRIX) {
1289     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1290 
1291     /* Get some new tags to keep the communication clean */
1292     tag1 = ((PetscObject)C)->tag;
1293     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1294     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1295 
1296     /* evaluate communication - mesg to who, length of mesg, and buffer space
1297      required. Based on this, buffers are allocated, and data copied into them */
1298     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1299     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1300 
1301     /* w1[proc] = num of rows owned by proc -- to be requested */
1302     proc = 0;
1303     nrqs = 0; /* num of outgoing messages */
1304     for (j=0; j<nrow; j++) {
1305       row  = irow[j];
1306       if (!isrowsorted) proc = 0;
1307       while (row >= C->rmap->range[proc+1]) proc++;
1308       w1[proc]++;
1309       row2proc[j] = proc; /* map row index to proc */
1310 
1311       if (proc != rank && !w2[proc]) {
1312         w2[proc] = 1; nrqs++;
1313       }
1314     }
1315     w1[rank] = 0;  /* rows owned by self will not be requested */
1316 
1317     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1318     for (proc=0,j=0; proc<size; proc++) {
1319       if (w1[proc]) { pa[j++] = proc;}
1320     }
1321 
1322     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1323     msz = 0;              /* total mesg length (for all procs) */
1324     for (i=0; i<nrqs; i++) {
1325       proc      = pa[i];
1326       w1[proc] += 3;
1327       msz      += w1[proc];
1328     }
1329     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1330 
1331     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1332     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1333     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1334 
1335     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1336        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1337     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1338 
1339     /* Now post the Irecvs corresponding to these messages */
1340     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1341 
1342     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1343     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1344 
1345     /* Allocate Memory for outgoing messages */
1346     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1347     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1348     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1349 
1350     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1351     iptr = tmp;
1352     for (i=0; i<nrqs; i++) {
1353       proc        = pa[i];
1354       sbuf1[proc] = iptr;
1355       iptr       += w1[proc];
1356     }
1357 
1358     /* Form the outgoing messages */
1359     /* Initialize the header space */
1360     for (i=0; i<nrqs; i++) {
1361       proc      = pa[i];
1362       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1363       ptr[proc] = sbuf1[proc] + 3;
1364     }
1365 
1366     /* Parse the isrow and copy data into outbuf */
1367     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1368     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1369       proc = row2proc[j];
1370       if (proc != rank) { /* copy to the outgoing buf*/
1371         *ptr[proc] = irow[j];
1372         ctr[proc]++; ptr[proc]++;
1373       }
1374     }
1375 
1376     /* Update the headers for the current IS */
1377     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1378       if ((ctr_j = ctr[j])) {
1379         sbuf1_j        = sbuf1[j];
1380         k              = ++sbuf1_j[0];
1381         sbuf1_j[2*k]   = ctr_j;
1382         sbuf1_j[2*k-1] = 0;
1383       }
1384     }
1385 
1386     /* Now post the sends */
1387     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1388     for (i=0; i<nrqs; ++i) {
1389       proc = pa[i];
1390       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1391     }
1392 
1393     /* Post Receives to capture the buffer size */
1394     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1395     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1396 
1397     rbuf2[0] = tmp + msz;
1398     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1399 
1400     for (i=0; i<nrqs; ++i) {
1401       proc = pa[i];
1402       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1403     }
1404 
1405     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1406 
1407     /* Send to other procs the buf size they should allocate */
1408     /* Receive messages*/
1409     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1410     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1411 
1412     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1413     for (i=0; i<nrqr; ++i) {
1414       req_size[i] = 0;
1415       rbuf1_i        = rbuf1[i];
1416       start          = 2*rbuf1_i[0] + 1;
1417       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1418       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1419       sbuf2_i        = sbuf2[i];
1420       for (j=start; j<end; j++) {
1421         k            = rbuf1_i[j] - rstart;
1422         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1423         sbuf2_i[j]   = ncols;
1424         req_size[i] += ncols;
1425       }
1426       req_source1[i] = r_status1[i].MPI_SOURCE;
1427 
1428       /* form the header */
1429       sbuf2_i[0] = req_size[i];
1430       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1431 
1432       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1433     }
1434 
1435     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1436     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1437 
1438     /* rbuf2 is received, Post recv column indices a->j */
1439     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1440 
1441     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1442     for (i=0; i<nrqs; ++i) {
1443       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1444       req_source2[i] = r_status2[i].MPI_SOURCE;
1445       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1446     }
1447 
1448     /* Wait on sends1 and sends2 */
1449     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1450     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1451     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1452     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1453 
1454     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1455     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1456 
1457     /* Now allocate sending buffers for a->j, and send them off */
1458     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1459     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1460     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1461     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1462 
1463     for (i=0; i<nrqr; i++) { /* for each requested message */
1464       rbuf1_i   = rbuf1[i];
1465       sbuf_aj_i = sbuf_aj[i];
1466       ct1       = 2*rbuf1_i[0] + 1;
1467       ct2       = 0;
1468       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1469 
1470       kmax = rbuf1[i][2];
1471       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1472         row    = rbuf1_i[ct1] - rstart;
1473         nzA    = ai[row+1] - ai[row];
1474         nzB    = bi[row+1] - bi[row];
1475         ncols  = nzA + nzB;
1476         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1477 
1478         /* load the column indices for this row into cols*/
1479         cols = sbuf_aj_i + ct2;
1480 
1481         lwrite = 0;
1482         for (l=0; l<nzB; l++) {
1483           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1484         }
1485         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1486         for (l=0; l<nzB; l++) {
1487           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1488         }
1489 
1490         ct2 += ncols;
1491       }
1492       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1493     }
1494 
1495     /* create column map (cmap): global col of C -> local col of submat */
1496 #if defined(PETSC_USE_CTABLE)
1497     if (!allcolumns) {
1498       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1499       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1500       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1501         if (icol[j] >= cstart && icol[j] <cend) {
1502           cmap_loc[icol[j] - cstart] = j+1;
1503         } else { /* use PetscTable for non-local col indices */
1504           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1505         }
1506       }
1507     } else {
1508       cmap     = NULL;
1509       cmap_loc = NULL;
1510     }
1511     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1512 #else
1513     if (!allcolumns) {
1514       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1515       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1516     } else {
1517       cmap = NULL;
1518     }
1519 #endif
1520 
1521     /* Create lens for MatSeqAIJSetPreallocation() */
1522     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1523 
1524     /* Compute lens from local part of C */
1525     for (j=0; j<nrow; j++) {
1526       row  = irow[j];
1527       proc = row2proc[j];
1528       if (proc == rank) {
1529         /* diagonal part A = c->A */
1530         ncols = ai[row-rstart+1] - ai[row-rstart];
1531         cols  = aj + ai[row-rstart];
1532         if (!allcolumns) {
1533           for (k=0; k<ncols; k++) {
1534 #if defined(PETSC_USE_CTABLE)
1535             tcol = cmap_loc[cols[k]];
1536 #else
1537             tcol = cmap[cols[k]+cstart];
1538 #endif
1539             if (tcol) lens[j]++;
1540           }
1541         } else { /* allcolumns */
1542           lens[j] = ncols;
1543         }
1544 
1545         /* off-diagonal part B = c->B */
1546         ncols = bi[row-rstart+1] - bi[row-rstart];
1547         cols  = bj + bi[row-rstart];
1548         if (!allcolumns) {
1549           for (k=0; k<ncols; k++) {
1550 #if defined(PETSC_USE_CTABLE)
1551             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1552 #else
1553             tcol = cmap[bmap[cols[k]]];
1554 #endif
1555             if (tcol) lens[j]++;
1556           }
1557         } else { /* allcolumns */
1558           lens[j] += ncols;
1559         }
1560       }
1561     }
1562 
1563     /* Create row map (rmap): global row of C -> local row of submat */
1564 #if defined(PETSC_USE_CTABLE)
1565     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1566     for (j=0; j<nrow; j++) {
1567       row  = irow[j];
1568       proc = row2proc[j];
1569       if (proc == rank) { /* a local row */
1570         rmap_loc[row - rstart] = j;
1571       } else {
1572         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1573       }
1574     }
1575 #else
1576     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1577     for (j=0; j<nrow; j++) {
1578       rmap[irow[j]] = j;
1579     }
1580 #endif
1581 
1582     /* Update lens from offproc data */
1583     /* recv a->j is done */
1584     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1585     for (i=0; i<nrqs; i++) {
1586       proc    = pa[i];
1587       sbuf1_i = sbuf1[proc];
1588       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1589       ct1     = 2 + 1;
1590       ct2     = 0;
1591       rbuf2_i = rbuf2[i]; /* received length of C->j */
1592       rbuf3_i = rbuf3[i]; /* received C->j */
1593 
1594       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1595       max1   = sbuf1_i[2];
1596       for (k=0; k<max1; k++,ct1++) {
1597 #if defined(PETSC_USE_CTABLE)
1598         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1599         row--;
1600         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1601 #else
1602         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1603 #endif
1604         /* Now, store row index of submat in sbuf1_i[ct1] */
1605         sbuf1_i[ct1] = row;
1606 
1607         nnz = rbuf2_i[ct1];
1608         if (!allcolumns) {
1609           for (l=0; l<nnz; l++,ct2++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1612               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1613             } else {
1614               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1615             }
1616 #else
1617             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1618 #endif
1619             if (tcol) lens[row]++;
1620           }
1621         } else { /* allcolumns */
1622           lens[row] += nnz;
1623         }
1624       }
1625     }
1626     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1627     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1628 
1629     /* Create the submatrices */
1630     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1631     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1632 
1633     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1634     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1635     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1636     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1637     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1638 
1639     /* create struct Mat_SubSppt and attached it to submat */
1640     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1641     subc = (Mat_SeqAIJ*)submat->data;
1642     subc->submatis1 = smatis1;
1643 
1644     smatis1->id          = 0;
1645     smatis1->nrqs        = nrqs;
1646     smatis1->nrqr        = nrqr;
1647     smatis1->rbuf1       = rbuf1;
1648     smatis1->rbuf2       = rbuf2;
1649     smatis1->rbuf3       = rbuf3;
1650     smatis1->sbuf2       = sbuf2;
1651     smatis1->req_source2 = req_source2;
1652 
1653     smatis1->sbuf1       = sbuf1;
1654     smatis1->ptr         = ptr;
1655     smatis1->tmp         = tmp;
1656     smatis1->ctr         = ctr;
1657 
1658     smatis1->pa           = pa;
1659     smatis1->req_size     = req_size;
1660     smatis1->req_source1  = req_source1;
1661 
1662     smatis1->allcolumns  = allcolumns;
1663     smatis1->singleis    = PETSC_TRUE;
1664     smatis1->row2proc    = row2proc;
1665     smatis1->rmap        = rmap;
1666     smatis1->cmap        = cmap;
1667 #if defined(PETSC_USE_CTABLE)
1668     smatis1->rmap_loc    = rmap_loc;
1669     smatis1->cmap_loc    = cmap_loc;
1670 #endif
1671 
1672     smatis1->destroy     = submat->ops->destroy;
1673     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1674     submat->factortype   = C->factortype;
1675 
1676     /* compute rmax */
1677     rmax = 0;
1678     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1679 
1680   } else { /* scall == MAT_REUSE_MATRIX */
1681     submat = submats[0];
1682     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1683 
1684     subc    = (Mat_SeqAIJ*)submat->data;
1685     rmax    = subc->rmax;
1686     smatis1 = subc->submatis1;
1687     nrqs        = smatis1->nrqs;
1688     nrqr        = smatis1->nrqr;
1689     rbuf1       = smatis1->rbuf1;
1690     rbuf2       = smatis1->rbuf2;
1691     rbuf3       = smatis1->rbuf3;
1692     req_source2 = smatis1->req_source2;
1693 
1694     sbuf1     = smatis1->sbuf1;
1695     sbuf2     = smatis1->sbuf2;
1696     ptr       = smatis1->ptr;
1697     tmp       = smatis1->tmp;
1698     ctr       = smatis1->ctr;
1699 
1700     pa         = smatis1->pa;
1701     req_size   = smatis1->req_size;
1702     req_source1 = smatis1->req_source1;
1703 
1704     allcolumns = smatis1->allcolumns;
1705     row2proc   = smatis1->row2proc;
1706     rmap       = smatis1->rmap;
1707     cmap       = smatis1->cmap;
1708 #if defined(PETSC_USE_CTABLE)
1709     rmap_loc   = smatis1->rmap_loc;
1710     cmap_loc   = smatis1->cmap_loc;
1711 #endif
1712   }
1713 
1714   /* Post recv matrix values */
1715   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1716   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1717   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1718   for (i=0; i<nrqs; ++i) {
1719     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1720     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1721   }
1722 
1723   /* Allocate sending buffers for a->a, and send them off */
1724   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1725   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1726   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1727   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1728 
1729   for (i=0; i<nrqr; i++) {
1730     rbuf1_i   = rbuf1[i];
1731     sbuf_aa_i = sbuf_aa[i];
1732     ct1       = 2*rbuf1_i[0]+1;
1733     ct2       = 0;
1734     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1735 
1736     kmax = rbuf1_i[2];
1737     for (k=0; k<kmax; k++,ct1++) {
1738       row = rbuf1_i[ct1] - rstart;
1739       nzA = ai[row+1] - ai[row];
1740       nzB = bi[row+1] - bi[row];
1741       ncols  = nzA + nzB;
1742       cworkB = bj + bi[row];
1743       vworkA = a_a + ai[row];
1744       vworkB = b_a + bi[row];
1745 
1746       /* load the column values for this row into vals*/
1747       vals = sbuf_aa_i + ct2;
1748 
1749       lwrite = 0;
1750       for (l=0; l<nzB; l++) {
1751         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1752       }
1753       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1754       for (l=0; l<nzB; l++) {
1755         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1756       }
1757 
1758       ct2 += ncols;
1759     }
1760     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1761   }
1762 
1763   /* Assemble submat */
1764   /* First assemble the local rows */
1765   for (j=0; j<nrow; j++) {
1766     row  = irow[j];
1767     proc = row2proc[j];
1768     if (proc == rank) {
1769       Crow = row - rstart;  /* local row index of C */
1770 #if defined(PETSC_USE_CTABLE)
1771       row = rmap_loc[Crow]; /* row index of submat */
1772 #else
1773       row = rmap[row];
1774 #endif
1775 
1776       if (allcolumns) {
1777         /* diagonal part A = c->A */
1778         ncols = ai[Crow+1] - ai[Crow];
1779         cols  = aj + ai[Crow];
1780         vals  = a->a + ai[Crow];
1781         i     = 0;
1782         for (k=0; k<ncols; k++) {
1783           subcols[i]   = cols[k] + cstart;
1784           subvals[i++] = vals[k];
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow+1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b->a + bi[Crow];
1791         for (k=0; k<ncols; k++) {
1792           subcols[i]   = bmap[cols[k]];
1793           subvals[i++] = vals[k];
1794         }
1795 
1796         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1797 
1798       } else { /* !allcolumns */
1799 #if defined(PETSC_USE_CTABLE)
1800         /* diagonal part A = c->A */
1801         ncols = ai[Crow+1] - ai[Crow];
1802         cols  = aj + ai[Crow];
1803         vals  = a->a + ai[Crow];
1804         i     = 0;
1805         for (k=0; k<ncols; k++) {
1806           tcol = cmap_loc[cols[k]];
1807           if (tcol) {
1808             subcols[i]   = --tcol;
1809             subvals[i++] = vals[k];
1810           }
1811         }
1812 
1813         /* off-diagonal part B = c->B */
1814         ncols = bi[Crow+1] - bi[Crow];
1815         cols  = bj + bi[Crow];
1816         vals  = b->a + bi[Crow];
1817         for (k=0; k<ncols; k++) {
1818           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1819           if (tcol) {
1820             subcols[i]   = --tcol;
1821             subvals[i++] = vals[k];
1822           }
1823         }
1824 #else
1825         /* diagonal part A = c->A */
1826         ncols = ai[Crow+1] - ai[Crow];
1827         cols  = aj + ai[Crow];
1828         vals  = a->a + ai[Crow];
1829         i     = 0;
1830         for (k=0; k<ncols; k++) {
1831           tcol = cmap[cols[k]+cstart];
1832           if (tcol) {
1833             subcols[i]   = --tcol;
1834             subvals[i++] = vals[k];
1835           }
1836         }
1837 
1838         /* off-diagonal part B = c->B */
1839         ncols = bi[Crow+1] - bi[Crow];
1840         cols  = bj + bi[Crow];
1841         vals  = b->a + bi[Crow];
1842         for (k=0; k<ncols; k++) {
1843           tcol = cmap[bmap[cols[k]]];
1844           if (tcol) {
1845             subcols[i]   = --tcol;
1846             subvals[i++] = vals[k];
1847           }
1848         }
1849 #endif
1850         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1851       }
1852     }
1853   }
1854 
1855   /* Now assemble the off-proc rows */
1856   for (i=0; i<nrqs; i++) { /* for each requested message */
1857     /* recv values from other processes */
1858     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1859     proc    = pa[idex];
1860     sbuf1_i = sbuf1[proc];
1861     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1862     ct1     = 2 + 1;
1863     ct2     = 0; /* count of received C->j */
1864     ct3     = 0; /* count of received C->j that will be inserted into submat */
1865     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1866     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1867     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1868 
1869     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1870     max1 = sbuf1_i[2];             /* num of rows */
1871     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1872       row = sbuf1_i[ct1]; /* row index of submat */
1873       if (!allcolumns) {
1874         idex = 0;
1875         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1876           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1877           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1878 #if defined(PETSC_USE_CTABLE)
1879             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1880               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1881             } else {
1882               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1883             }
1884 #else
1885             tcol = cmap[rbuf3_i[ct2]];
1886 #endif
1887             if (tcol) {
1888               subcols[idex]   = --tcol; /* may not be sorted */
1889               subvals[idex++] = rbuf4_i[ct2];
1890 
1891               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1892                For reuse, we replace received C->j with index that should be inserted to submat */
1893               rbuf3_i[ct3++] = ct2;
1894             }
1895           }
1896           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1897         } else { /* scall == MAT_REUSE_MATRIX */
1898           submat = submats[0];
1899           subc   = (Mat_SeqAIJ*)submat->data;
1900 
1901           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1902           for (l=0; l<nnz; l++) {
1903             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1904             subvals[idex++] = rbuf4_i[ct2];
1905           }
1906 
1907           bj = subc->j + subc->i[row]; /* sorted column indices */
1908           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1909         }
1910       } else { /* allcolumns */
1911         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1912         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1913         ct2 += nnz;
1914       }
1915     }
1916   }
1917 
1918   /* sending a->a are done */
1919   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1920   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1921 
1922   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   submats[0] = submat;
1925 
1926   /* Restore the indices */
1927   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1928   if (!allcolumns) {
1929     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1930   }
1931 
1932   /* Destroy allocated memory */
1933   for (i=0; i<nrqs; ++i) {
1934     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1935   }
1936   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1937   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1938   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1943     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1944   }
1945   PetscFunctionReturn(0);
1946 }
1947 
1948 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1949 {
1950   PetscErrorCode ierr;
1951   PetscInt       ncol;
1952   PetscBool      colflag,allcolumns=PETSC_FALSE;
1953 
1954   PetscFunctionBegin;
1955   /* Allocate memory to hold all the submatrices */
1956   if (scall == MAT_INITIAL_MATRIX) {
1957     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1958   }
1959 
1960   /* Check for special case: each processor gets entire matrix columns */
1961   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1962   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1963   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1964 
1965   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1966   PetscFunctionReturn(0);
1967 }
1968 
1969 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1970 {
1971   PetscErrorCode ierr;
1972   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1973   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1974   Mat_SeqAIJ     *subc;
1975   Mat_SubSppt    *smat;
1976 
1977   PetscFunctionBegin;
1978   /* Check for special case: each processor has a single IS */
1979   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1980     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1981     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
1982     PetscFunctionReturn(0);
1983   }
1984 
1985   /* Collect global wantallmatrix and nstages */
1986   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1987   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1988   if (!nmax) nmax = 1;
1989 
1990   if (scall == MAT_INITIAL_MATRIX) {
1991     /* Collect global wantallmatrix and nstages */
1992     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1993       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1994       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1995       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1996       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1997       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1998         wantallmatrix = PETSC_TRUE;
1999 
2000         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2001       }
2002     }
2003 
2004     /* Determine the number of stages through which submatrices are done
2005        Each stage will extract nmax submatrices.
2006        nmax is determined by the matrix column dimension.
2007        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2008     */
2009     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2010 
2011     in[0] = -1*(PetscInt)wantallmatrix;
2012     in[1] = nstages;
2013     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2014     wantallmatrix = (PetscBool)(-out[0]);
2015     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2016 
2017   } else { /* MAT_REUSE_MATRIX */
2018     if (ismax) {
2019       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2020       smat = subc->submatis1;
2021     } else { /* (*submat)[0] is a dummy matrix */
2022       smat = (Mat_SubSppt*)(*submat)[0]->data;
2023     }
2024     if (!smat) {
2025       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2026       wantallmatrix = PETSC_TRUE;
2027     } else if (smat->singleis) {
2028       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2029       PetscFunctionReturn(0);
2030     } else {
2031       nstages = smat->nstages;
2032     }
2033   }
2034 
2035   if (wantallmatrix) {
2036     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2037     PetscFunctionReturn(0);
2038   }
2039 
2040   /* Allocate memory to hold all the submatrices and dummy submatrices */
2041   if (scall == MAT_INITIAL_MATRIX) {
2042     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2043   }
2044 
2045   for (i=0,pos=0; i<nstages; i++) {
2046     if (pos+nmax <= ismax) max_no = nmax;
2047     else if (pos == ismax) max_no = 0;
2048     else                   max_no = ismax-pos;
2049 
2050     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2051     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2052       smat = (Mat_SubSppt*)(*submat)[pos]->data;
2053       smat->nstages = nstages;
2054     }
2055     pos += max_no;
2056   }
2057 
2058   if (ismax && scall == MAT_INITIAL_MATRIX) {
2059     /* save nstages for reuse */
2060     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2061     smat = subc->submatis1;
2062     smat->nstages = nstages;
2063   }
2064   PetscFunctionReturn(0);
2065 }
2066 
2067 /* -------------------------------------------------------------------------*/
2068 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2069 {
2070   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2071   Mat            A  = c->A;
2072   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2073   const PetscInt **icol,**irow;
2074   PetscInt       *nrow,*ncol,start;
2075   PetscErrorCode ierr;
2076   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2077   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2078   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2079   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2080   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2081 #if defined(PETSC_USE_CTABLE)
2082   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2083 #else
2084   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2085 #endif
2086   const PetscInt *irow_i;
2087   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2088   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2089   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2090   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2091   MPI_Status     *r_status3,*r_status4,*s_status4;
2092   MPI_Comm       comm;
2093   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2094   PetscMPIInt    *onodes1,*olengths1,end;
2095   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2096   Mat_SubSppt    *smat_i;
2097   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2098   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2099 
2100   PetscFunctionBegin;
2101   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2102   size = c->size;
2103   rank = c->rank;
2104 
2105   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2106   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2107 
2108   for (i=0; i<ismax; i++) {
2109     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2110     if (!issorted[i]) iscsorted = issorted[i];
2111 
2112     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2113 
2114     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2115     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2116 
2117     /* Check for special case: allcolumn */
2118     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2119     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2120     if (colflag && ncol[i] == C->cmap->N) {
2121       allcolumns[i] = PETSC_TRUE;
2122       icol[i] = NULL;
2123     } else {
2124       allcolumns[i] = PETSC_FALSE;
2125       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2126     }
2127   }
2128 
2129   if (scall == MAT_REUSE_MATRIX) {
2130     /* Assumes new rows are same length as the old rows */
2131     for (i=0; i<ismax; i++) {
2132       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2133       subc = (Mat_SeqAIJ*)submats[i]->data;
2134       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2135 
2136       /* Initial matrix as if empty */
2137       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2138 
2139       smat_i   = subc->submatis1;
2140 
2141       nrqs        = smat_i->nrqs;
2142       nrqr        = smat_i->nrqr;
2143       rbuf1       = smat_i->rbuf1;
2144       rbuf2       = smat_i->rbuf2;
2145       rbuf3       = smat_i->rbuf3;
2146       req_source2 = smat_i->req_source2;
2147 
2148       sbuf1     = smat_i->sbuf1;
2149       sbuf2     = smat_i->sbuf2;
2150       ptr       = smat_i->ptr;
2151       tmp       = smat_i->tmp;
2152       ctr       = smat_i->ctr;
2153 
2154       pa          = smat_i->pa;
2155       req_size    = smat_i->req_size;
2156       req_source1 = smat_i->req_source1;
2157 
2158       allcolumns[i] = smat_i->allcolumns;
2159       row2proc[i]   = smat_i->row2proc;
2160       rmap[i]       = smat_i->rmap;
2161       cmap[i]       = smat_i->cmap;
2162     }
2163 
2164     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2165       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2166       smat_i = (Mat_SubSppt*)submats[0]->data;
2167 
2168       nrqs        = smat_i->nrqs;
2169       nrqr        = smat_i->nrqr;
2170       rbuf1       = smat_i->rbuf1;
2171       rbuf2       = smat_i->rbuf2;
2172       rbuf3       = smat_i->rbuf3;
2173       req_source2 = smat_i->req_source2;
2174 
2175       sbuf1       = smat_i->sbuf1;
2176       sbuf2       = smat_i->sbuf2;
2177       ptr         = smat_i->ptr;
2178       tmp         = smat_i->tmp;
2179       ctr         = smat_i->ctr;
2180 
2181       pa          = smat_i->pa;
2182       req_size    = smat_i->req_size;
2183       req_source1 = smat_i->req_source1;
2184 
2185       allcolumns[0] = PETSC_FALSE;
2186     }
2187   } else { /* scall == MAT_INITIAL_MATRIX */
2188     /* Get some new tags to keep the communication clean */
2189     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2190     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2191 
2192     /* evaluate communication - mesg to who, length of mesg, and buffer space
2193      required. Based on this, buffers are allocated, and data copied into them*/
2194     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2195 
2196     for (i=0; i<ismax; i++) {
2197       jmax   = nrow[i];
2198       irow_i = irow[i];
2199 
2200       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2201       row2proc[i] = row2proc_i;
2202 
2203       if (issorted[i]) proc = 0;
2204       for (j=0; j<jmax; j++) {
2205         if (!issorted[i]) proc = 0;
2206         row = irow_i[j];
2207         while (row >= C->rmap->range[proc+1]) proc++;
2208         w4[proc]++;
2209         row2proc_i[j] = proc; /* map row index to proc */
2210       }
2211       for (j=0; j<size; j++) {
2212         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2213       }
2214     }
2215 
2216     nrqs     = 0;              /* no of outgoing messages */
2217     msz      = 0;              /* total mesg length (for all procs) */
2218     w1[rank] = 0;              /* no mesg sent to self */
2219     w3[rank] = 0;
2220     for (i=0; i<size; i++) {
2221       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2222     }
2223     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2224     for (i=0,j=0; i<size; i++) {
2225       if (w1[i]) { pa[j] = i; j++; }
2226     }
2227 
2228     /* Each message would have a header = 1 + 2*(no of IS) + data */
2229     for (i=0; i<nrqs; i++) {
2230       j      = pa[i];
2231       w1[j] += w2[j] + 2* w3[j];
2232       msz   += w1[j];
2233     }
2234     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2235 
2236     /* Determine the number of messages to expect, their lengths, from from-ids */
2237     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2238     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2239 
2240     /* Now post the Irecvs corresponding to these messages */
2241     tag0 = ((PetscObject)C)->tag;
2242     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2243 
2244     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2245     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2246 
2247     /* Allocate Memory for outgoing messages */
2248     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2249     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2250     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2251 
2252     {
2253       PetscInt *iptr = tmp;
2254       k    = 0;
2255       for (i=0; i<nrqs; i++) {
2256         j        = pa[i];
2257         iptr    += k;
2258         sbuf1[j] = iptr;
2259         k        = w1[j];
2260       }
2261     }
2262 
2263     /* Form the outgoing messages. Initialize the header space */
2264     for (i=0; i<nrqs; i++) {
2265       j           = pa[i];
2266       sbuf1[j][0] = 0;
2267       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2268       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2269     }
2270 
2271     /* Parse the isrow and copy data into outbuf */
2272     for (i=0; i<ismax; i++) {
2273       row2proc_i = row2proc[i];
2274       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2275       irow_i = irow[i];
2276       jmax   = nrow[i];
2277       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2278         proc = row2proc_i[j];
2279         if (proc != rank) { /* copy to the outgoing buf*/
2280           ctr[proc]++;
2281           *ptr[proc] = irow_i[j];
2282           ptr[proc]++;
2283         }
2284       }
2285       /* Update the headers for the current IS */
2286       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2287         if ((ctr_j = ctr[j])) {
2288           sbuf1_j        = sbuf1[j];
2289           k              = ++sbuf1_j[0];
2290           sbuf1_j[2*k]   = ctr_j;
2291           sbuf1_j[2*k-1] = i;
2292         }
2293       }
2294     }
2295 
2296     /*  Now  post the sends */
2297     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2298     for (i=0; i<nrqs; ++i) {
2299       j    = pa[i];
2300       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2301     }
2302 
2303     /* Post Receives to capture the buffer size */
2304     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2305     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2306     rbuf2[0] = tmp + msz;
2307     for (i=1; i<nrqs; ++i) {
2308       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2309     }
2310     for (i=0; i<nrqs; ++i) {
2311       j    = pa[i];
2312       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2313     }
2314 
2315     /* Send to other procs the buf size they should allocate */
2316     /* Receive messages*/
2317     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2318     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2319     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2320     {
2321       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2322       PetscInt   *sbuf2_i;
2323 
2324       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2325       for (i=0; i<nrqr; ++i) {
2326         req_size[i] = 0;
2327         rbuf1_i        = rbuf1[i];
2328         start          = 2*rbuf1_i[0] + 1;
2329         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2330         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2331         sbuf2_i        = sbuf2[i];
2332         for (j=start; j<end; j++) {
2333           id              = rbuf1_i[j] - rstart;
2334           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2335           sbuf2_i[j]      = ncols;
2336           req_size[i] += ncols;
2337         }
2338         req_source1[i] = r_status1[i].MPI_SOURCE;
2339         /* form the header */
2340         sbuf2_i[0] = req_size[i];
2341         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2342 
2343         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2344       }
2345     }
2346     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2347     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2348     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2349 
2350     /* Receive messages*/
2351     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2352     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2353 
2354     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2355     for (i=0; i<nrqs; ++i) {
2356       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2357       req_source2[i] = r_status2[i].MPI_SOURCE;
2358       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2359     }
2360     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2361     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2362 
2363     /* Wait on sends1 and sends2 */
2364     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2365     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2366 
2367     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2368     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2369     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2370     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2371     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2372     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2373 
2374     /* Now allocate sending buffers for a->j, and send them off */
2375     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2376     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2377     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2378     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2379 
2380     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2381     {
2382       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2383       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2384       PetscInt cend = C->cmap->rend;
2385       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2386 
2387       for (i=0; i<nrqr; i++) {
2388         rbuf1_i   = rbuf1[i];
2389         sbuf_aj_i = sbuf_aj[i];
2390         ct1       = 2*rbuf1_i[0] + 1;
2391         ct2       = 0;
2392         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2393           kmax = rbuf1[i][2*j];
2394           for (k=0; k<kmax; k++,ct1++) {
2395             row    = rbuf1_i[ct1] - rstart;
2396             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2397             ncols  = nzA + nzB;
2398             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2399 
2400             /* load the column indices for this row into cols */
2401             cols = sbuf_aj_i + ct2;
2402 
2403             lwrite = 0;
2404             for (l=0; l<nzB; l++) {
2405               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2406             }
2407             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2408             for (l=0; l<nzB; l++) {
2409               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2410             }
2411 
2412             ct2 += ncols;
2413           }
2414         }
2415         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2416       }
2417     }
2418     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2419 
2420     /* create col map: global col of C -> local col of submatrices */
2421     {
2422       const PetscInt *icol_i;
2423 #if defined(PETSC_USE_CTABLE)
2424       for (i=0; i<ismax; i++) {
2425         if (!allcolumns[i]) {
2426           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2427 
2428           jmax   = ncol[i];
2429           icol_i = icol[i];
2430           cmap_i = cmap[i];
2431           for (j=0; j<jmax; j++) {
2432             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2433           }
2434         } else cmap[i] = NULL;
2435       }
2436 #else
2437       for (i=0; i<ismax; i++) {
2438         if (!allcolumns[i]) {
2439           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2440           jmax   = ncol[i];
2441           icol_i = icol[i];
2442           cmap_i = cmap[i];
2443           for (j=0; j<jmax; j++) {
2444             cmap_i[icol_i[j]] = j+1;
2445           }
2446         } else cmap[i] = NULL;
2447       }
2448 #endif
2449     }
2450 
2451     /* Create lens which is required for MatCreate... */
2452     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2453     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2454 
2455     if (ismax) {
2456       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2457     }
2458     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2459 
2460     /* Update lens from local data */
2461     for (i=0; i<ismax; i++) {
2462       row2proc_i = row2proc[i];
2463       jmax = nrow[i];
2464       if (!allcolumns[i]) cmap_i = cmap[i];
2465       irow_i = irow[i];
2466       lens_i = lens[i];
2467       for (j=0; j<jmax; j++) {
2468         row = irow_i[j];
2469         proc = row2proc_i[j];
2470         if (proc == rank) {
2471           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2472           if (!allcolumns[i]) {
2473             for (k=0; k<ncols; k++) {
2474 #if defined(PETSC_USE_CTABLE)
2475               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2476 #else
2477               tcol = cmap_i[cols[k]];
2478 #endif
2479               if (tcol) lens_i[j]++;
2480             }
2481           } else { /* allcolumns */
2482             lens_i[j] = ncols;
2483           }
2484           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2485         }
2486       }
2487     }
2488 
2489     /* Create row map: global row of C -> local row of submatrices */
2490 #if defined(PETSC_USE_CTABLE)
2491     for (i=0; i<ismax; i++) {
2492       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2493       irow_i = irow[i];
2494       jmax   = nrow[i];
2495       for (j=0; j<jmax; j++) {
2496       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2497       }
2498     }
2499 #else
2500     for (i=0; i<ismax; i++) {
2501       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2502       rmap_i = rmap[i];
2503       irow_i = irow[i];
2504       jmax   = nrow[i];
2505       for (j=0; j<jmax; j++) {
2506         rmap_i[irow_i[j]] = j;
2507       }
2508     }
2509 #endif
2510 
2511     /* Update lens from offproc data */
2512     {
2513       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2514 
2515       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2516       for (tmp2=0; tmp2<nrqs; tmp2++) {
2517         sbuf1_i = sbuf1[pa[tmp2]];
2518         jmax    = sbuf1_i[0];
2519         ct1     = 2*jmax+1;
2520         ct2     = 0;
2521         rbuf2_i = rbuf2[tmp2];
2522         rbuf3_i = rbuf3[tmp2];
2523         for (j=1; j<=jmax; j++) {
2524           is_no  = sbuf1_i[2*j-1];
2525           max1   = sbuf1_i[2*j];
2526           lens_i = lens[is_no];
2527           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2528           rmap_i = rmap[is_no];
2529           for (k=0; k<max1; k++,ct1++) {
2530 #if defined(PETSC_USE_CTABLE)
2531             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2532             row--;
2533             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2534 #else
2535             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2536 #endif
2537             max2 = rbuf2_i[ct1];
2538             for (l=0; l<max2; l++,ct2++) {
2539               if (!allcolumns[is_no]) {
2540 #if defined(PETSC_USE_CTABLE)
2541                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2542 #else
2543                 tcol = cmap_i[rbuf3_i[ct2]];
2544 #endif
2545                 if (tcol) lens_i[row]++;
2546               } else { /* allcolumns */
2547                 lens_i[row]++; /* lens_i[row] += max2 ? */
2548               }
2549             }
2550           }
2551         }
2552       }
2553     }
2554     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2555     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2556     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2557     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2558 
2559     /* Create the submatrices */
2560     for (i=0; i<ismax; i++) {
2561       PetscInt    rbs,cbs;
2562 
2563       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2564       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2565 
2566       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2567       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2568 
2569       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2570       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2571       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2572 
2573       /* create struct Mat_SubSppt and attached it to submat */
2574       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2575       subc = (Mat_SeqAIJ*)submats[i]->data;
2576       subc->submatis1 = smat_i;
2577 
2578       smat_i->destroy          = submats[i]->ops->destroy;
2579       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2580       submats[i]->factortype   = C->factortype;
2581 
2582       smat_i->id          = i;
2583       smat_i->nrqs        = nrqs;
2584       smat_i->nrqr        = nrqr;
2585       smat_i->rbuf1       = rbuf1;
2586       smat_i->rbuf2       = rbuf2;
2587       smat_i->rbuf3       = rbuf3;
2588       smat_i->sbuf2       = sbuf2;
2589       smat_i->req_source2 = req_source2;
2590 
2591       smat_i->sbuf1       = sbuf1;
2592       smat_i->ptr         = ptr;
2593       smat_i->tmp         = tmp;
2594       smat_i->ctr         = ctr;
2595 
2596       smat_i->pa           = pa;
2597       smat_i->req_size     = req_size;
2598       smat_i->req_source1  = req_source1;
2599 
2600       smat_i->allcolumns  = allcolumns[i];
2601       smat_i->singleis    = PETSC_FALSE;
2602       smat_i->row2proc    = row2proc[i];
2603       smat_i->rmap        = rmap[i];
2604       smat_i->cmap        = cmap[i];
2605     }
2606 
2607     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2608       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2609       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2610       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2611 
2612       /* create struct Mat_SubSppt and attached it to submat */
2613       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2614       submats[0]->data = (void*)smat_i;
2615 
2616       smat_i->destroy          = submats[0]->ops->destroy;
2617       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2618       submats[0]->factortype   = C->factortype;
2619 
2620       smat_i->id          = 0;
2621       smat_i->nrqs        = nrqs;
2622       smat_i->nrqr        = nrqr;
2623       smat_i->rbuf1       = rbuf1;
2624       smat_i->rbuf2       = rbuf2;
2625       smat_i->rbuf3       = rbuf3;
2626       smat_i->sbuf2       = sbuf2;
2627       smat_i->req_source2 = req_source2;
2628 
2629       smat_i->sbuf1       = sbuf1;
2630       smat_i->ptr         = ptr;
2631       smat_i->tmp         = tmp;
2632       smat_i->ctr         = ctr;
2633 
2634       smat_i->pa           = pa;
2635       smat_i->req_size     = req_size;
2636       smat_i->req_source1  = req_source1;
2637 
2638       smat_i->allcolumns  = PETSC_FALSE;
2639       smat_i->singleis    = PETSC_FALSE;
2640       smat_i->row2proc    = NULL;
2641       smat_i->rmap        = NULL;
2642       smat_i->cmap        = NULL;
2643     }
2644 
2645     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2646     ierr = PetscFree(lens);CHKERRQ(ierr);
2647     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2648     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2649 
2650   } /* endof scall == MAT_INITIAL_MATRIX */
2651 
2652   /* Post recv matrix values */
2653   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2654   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2655   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2656   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2657   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2658   for (i=0; i<nrqs; ++i) {
2659     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2660     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2661   }
2662 
2663   /* Allocate sending buffers for a->a, and send them off */
2664   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2665   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2666   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2667   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2668 
2669   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2670   {
2671     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2672     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2673     PetscInt    cend   = C->cmap->rend;
2674     PetscInt    *b_j   = b->j;
2675     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2676 
2677     for (i=0; i<nrqr; i++) {
2678       rbuf1_i   = rbuf1[i];
2679       sbuf_aa_i = sbuf_aa[i];
2680       ct1       = 2*rbuf1_i[0]+1;
2681       ct2       = 0;
2682       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2683         kmax = rbuf1_i[2*j];
2684         for (k=0; k<kmax; k++,ct1++) {
2685           row    = rbuf1_i[ct1] - rstart;
2686           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2687           ncols  = nzA + nzB;
2688           cworkB = b_j + b_i[row];
2689           vworkA = a_a + a_i[row];
2690           vworkB = b_a + b_i[row];
2691 
2692           /* load the column values for this row into vals*/
2693           vals = sbuf_aa_i+ct2;
2694 
2695           lwrite = 0;
2696           for (l=0; l<nzB; l++) {
2697             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2698           }
2699           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2700           for (l=0; l<nzB; l++) {
2701             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2702           }
2703 
2704           ct2 += ncols;
2705         }
2706       }
2707       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2708     }
2709   }
2710 
2711   /* Assemble the matrices */
2712   /* First assemble the local rows */
2713   for (i=0; i<ismax; i++) {
2714     row2proc_i = row2proc[i];
2715     subc      = (Mat_SeqAIJ*)submats[i]->data;
2716     imat_ilen = subc->ilen;
2717     imat_j    = subc->j;
2718     imat_i    = subc->i;
2719     imat_a    = subc->a;
2720 
2721     if (!allcolumns[i]) cmap_i = cmap[i];
2722     rmap_i = rmap[i];
2723     irow_i = irow[i];
2724     jmax   = nrow[i];
2725     for (j=0; j<jmax; j++) {
2726       row  = irow_i[j];
2727       proc = row2proc_i[j];
2728       if (proc == rank) {
2729         old_row = row;
2730 #if defined(PETSC_USE_CTABLE)
2731         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2732         row--;
2733 #else
2734         row = rmap_i[row];
2735 #endif
2736         ilen_row = imat_ilen[row];
2737         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2738         mat_i    = imat_i[row];
2739         mat_a    = imat_a + mat_i;
2740         mat_j    = imat_j + mat_i;
2741         if (!allcolumns[i]) {
2742           for (k=0; k<ncols; k++) {
2743 #if defined(PETSC_USE_CTABLE)
2744             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2745 #else
2746             tcol = cmap_i[cols[k]];
2747 #endif
2748             if (tcol) {
2749               *mat_j++ = tcol - 1;
2750               *mat_a++ = vals[k];
2751               ilen_row++;
2752             }
2753           }
2754         } else { /* allcolumns */
2755           for (k=0; k<ncols; k++) {
2756             *mat_j++ = cols[k];  /* global col index! */
2757             *mat_a++ = vals[k];
2758             ilen_row++;
2759           }
2760         }
2761         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2762 
2763         imat_ilen[row] = ilen_row;
2764       }
2765     }
2766   }
2767 
2768   /* Now assemble the off proc rows */
2769   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2770   for (tmp2=0; tmp2<nrqs; tmp2++) {
2771     sbuf1_i = sbuf1[pa[tmp2]];
2772     jmax    = sbuf1_i[0];
2773     ct1     = 2*jmax + 1;
2774     ct2     = 0;
2775     rbuf2_i = rbuf2[tmp2];
2776     rbuf3_i = rbuf3[tmp2];
2777     rbuf4_i = rbuf4[tmp2];
2778     for (j=1; j<=jmax; j++) {
2779       is_no     = sbuf1_i[2*j-1];
2780       rmap_i    = rmap[is_no];
2781       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2782       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2783       imat_ilen = subc->ilen;
2784       imat_j    = subc->j;
2785       imat_i    = subc->i;
2786       imat_a    = subc->a;
2787       max1      = sbuf1_i[2*j];
2788       for (k=0; k<max1; k++,ct1++) {
2789         row = sbuf1_i[ct1];
2790 #if defined(PETSC_USE_CTABLE)
2791         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2792         row--;
2793 #else
2794         row = rmap_i[row];
2795 #endif
2796         ilen  = imat_ilen[row];
2797         mat_i = imat_i[row];
2798         mat_a = imat_a + mat_i;
2799         mat_j = imat_j + mat_i;
2800         max2  = rbuf2_i[ct1];
2801         if (!allcolumns[is_no]) {
2802           for (l=0; l<max2; l++,ct2++) {
2803 #if defined(PETSC_USE_CTABLE)
2804             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2805 #else
2806             tcol = cmap_i[rbuf3_i[ct2]];
2807 #endif
2808             if (tcol) {
2809               *mat_j++ = tcol - 1;
2810               *mat_a++ = rbuf4_i[ct2];
2811               ilen++;
2812             }
2813           }
2814         } else { /* allcolumns */
2815           for (l=0; l<max2; l++,ct2++) {
2816             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2817             *mat_a++ = rbuf4_i[ct2];
2818             ilen++;
2819           }
2820         }
2821         imat_ilen[row] = ilen;
2822       }
2823     }
2824   }
2825 
2826   if (!iscsorted) { /* sort column indices of the rows */
2827     for (i=0; i<ismax; i++) {
2828       subc      = (Mat_SeqAIJ*)submats[i]->data;
2829       imat_j    = subc->j;
2830       imat_i    = subc->i;
2831       imat_a    = subc->a;
2832       imat_ilen = subc->ilen;
2833 
2834       if (allcolumns[i]) continue;
2835       jmax = nrow[i];
2836       for (j=0; j<jmax; j++) {
2837         mat_i = imat_i[j];
2838         mat_a = imat_a + mat_i;
2839         mat_j = imat_j + mat_i;
2840         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2841       }
2842     }
2843   }
2844 
2845   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2846   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2847   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2848   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2849   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2850 
2851   /* Restore the indices */
2852   for (i=0; i<ismax; i++) {
2853     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2854     if (!allcolumns[i]) {
2855       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2856     }
2857   }
2858 
2859   for (i=0; i<ismax; i++) {
2860     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2861     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2862   }
2863 
2864   /* Destroy allocated memory */
2865   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2866   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2867   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2868 
2869   for (i=0; i<nrqs; ++i) {
2870     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2871   }
2872   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2873 
2874   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2875   PetscFunctionReturn(0);
2876 }
2877 
2878 /*
2879  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2880  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2881  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2882  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2883  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2884  state, and needs to be "assembled" later by compressing B's column space.
2885 
2886  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2887  Following this call, C->A & C->B have been created, even if empty.
2888  */
2889 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2890 {
2891   /* If making this function public, change the error returned in this function away from _PLIB. */
2892   PetscErrorCode ierr;
2893   Mat_MPIAIJ     *aij;
2894   Mat_SeqAIJ     *Baij;
2895   PetscBool      seqaij,Bdisassembled;
2896   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2897   PetscScalar    v;
2898   const PetscInt *rowindices,*colindices;
2899 
2900   PetscFunctionBegin;
2901   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2902   if (A) {
2903     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2904     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2905     if (rowemb) {
2906       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2907       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2908     } else {
2909       if (C->rmap->n != A->rmap->n) {
2910 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2911       }
2912     }
2913     if (dcolemb) {
2914       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2915       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2916     } else {
2917       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2918     }
2919   }
2920   if (B) {
2921     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2922     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2923     if (rowemb) {
2924       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2925       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2926     } else {
2927       if (C->rmap->n != B->rmap->n) {
2928 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2929       }
2930     }
2931     if (ocolemb) {
2932       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2933       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2934     } else {
2935       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2936     }
2937   }
2938 
2939   aij = (Mat_MPIAIJ*)C->data;
2940   if (!aij->A) {
2941     /* Mimic parts of MatMPIAIJSetPreallocation() */
2942     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2943     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2944     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2945     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2946     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2947   }
2948   if (A) {
2949     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2950   } else {
2951     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2952   }
2953   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2954     /*
2955       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2956       need to "disassemble" B -- convert it to using C's global indices.
2957       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2958 
2959       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2960       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2961 
2962       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2963       At least avoid calling MatSetValues() and the implied searches?
2964     */
2965 
2966     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2967 #if defined(PETSC_USE_CTABLE)
2968       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2969 #else
2970       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2971       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2972       if (aij->B) {
2973         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2974       }
2975 #endif
2976       ngcol = 0;
2977       if (aij->lvec) {
2978 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2979       }
2980       if (aij->garray) {
2981 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2982 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2983       }
2984       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2985       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2986     }
2987     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2988       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2989     }
2990     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2991       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2992     }
2993   }
2994   Bdisassembled = PETSC_FALSE;
2995   if (!aij->B) {
2996     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2997     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2998     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2999     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3000     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3001     Bdisassembled = PETSC_TRUE;
3002   }
3003   if (B) {
3004     Baij = (Mat_SeqAIJ*)B->data;
3005     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3006       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3007       for (i=0; i<B->rmap->n; i++) {
3008 	nz[i] = Baij->i[i+1] - Baij->i[i];
3009       }
3010       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3011       ierr = PetscFree(nz);CHKERRQ(ierr);
3012     }
3013 
3014     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3015     shift = rend-rstart;
3016     count = 0;
3017     rowindices = NULL;
3018     colindices = NULL;
3019     if (rowemb) {
3020       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3021     }
3022     if (ocolemb) {
3023       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3024     }
3025     for (i=0; i<B->rmap->n; i++) {
3026       PetscInt row;
3027       row = i;
3028       if (rowindices) row = rowindices[i];
3029       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3030 	col  = Baij->j[count];
3031 	if (colindices) col = colindices[col];
3032 	if (Bdisassembled && col>=rstart) col += shift;
3033 	v    = Baij->a[count];
3034 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3035 	++count;
3036       }
3037     }
3038     /* No assembly for aij->B is necessary. */
3039     /* FIXME: set aij->B's nonzerostate correctly. */
3040   } else {
3041     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3042   }
3043   C->preallocated  = PETSC_TRUE;
3044   C->was_assembled = PETSC_FALSE;
3045   C->assembled     = PETSC_FALSE;
3046    /*
3047       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3048       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3049    */
3050   PetscFunctionReturn(0);
3051 }
3052 
3053 /*
3054   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3055  */
3056 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3057 {
3058   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3059 
3060   PetscFunctionBegin;
3061   PetscValidPointer(A,2);
3062   PetscValidPointer(B,3);
3063   /* FIXME: make sure C is assembled */
3064   *A = aij->A;
3065   *B = aij->B;
3066   /* Note that we don't incref *A and *B, so be careful! */
3067   PetscFunctionReturn(0);
3068 }
3069 
3070 /*
3071   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3072   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3073 */
3074 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3075                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3076 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3077 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3078 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3079 {
3080   PetscErrorCode ierr;
3081   PetscMPIInt    isize,flag;
3082   PetscInt       i,ii,cismax,ispar;
3083   Mat            *A,*B;
3084   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3085 
3086   PetscFunctionBegin;
3087   if (!ismax) PetscFunctionReturn(0);
3088 
3089   for (i = 0, cismax = 0; i < ismax; ++i) {
3090     PetscMPIInt isize;
3091     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3092     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3093     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3094     if (isize > 1) ++cismax;
3095   }
3096 
3097   /*
3098      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3099      ispar counts the number of parallel ISs across C's comm.
3100   */
3101   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3102   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3103     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3104     PetscFunctionReturn(0);
3105   }
3106 
3107   /* if (ispar) */
3108   /*
3109     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3110     These are used to extract the off-diag portion of the resulting parallel matrix.
3111     The row IS for the off-diag portion is the same as for the diag portion,
3112     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3113   */
3114   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3115   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3116   for (i = 0, ii = 0; i < ismax; ++i) {
3117     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3118     if (isize > 1) {
3119       /*
3120 	 TODO: This is the part that's ***NOT SCALABLE***.
3121 	 To fix this we need to extract just the indices of C's nonzero columns
3122 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3123 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3124 	 to be done without serializing on the IS list, so, most likely, it is best
3125 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3126       */
3127       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3128       /* Now we have to
3129 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3130 	     were sorted on each rank, concatenated they might no longer be sorted;
3131 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3132 	     indices in the nondecreasing order to the original index positions.
3133 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3134       */
3135       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3136       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3137       ++ii;
3138     }
3139   }
3140   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3141   for (i = 0, ii = 0; i < ismax; ++i) {
3142     PetscInt       j,issize;
3143     const PetscInt *indices;
3144 
3145     /*
3146        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3147      */
3148     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3149     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3150     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3151     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3152     for (j = 1; j < issize; ++j) {
3153       if (indices[j] == indices[j-1]) {
3154 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155       }
3156     }
3157     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3158 
3159 
3160     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3161     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3162     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3163     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3164     for (j = 1; j < issize; ++j) {
3165       if (indices[j-1] == indices[j]) {
3166 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3167       }
3168     }
3169     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3170     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3171     if (isize > 1) {
3172       cisrow[ii] = isrow[i];
3173       ++ii;
3174     }
3175   }
3176   /*
3177     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3178     array of sequential matrices underlying the resulting parallel matrices.
3179     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3180     contain duplicates.
3181 
3182     There are as many diag matrices as there are original index sets. There are only as many parallel
3183     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3184 
3185     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3186     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3187       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3188       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3189       with A[i] and B[ii] extracted from the corresponding MPI submat.
3190     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3191       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3192       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3193       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3194       values into A[i] and B[ii] sitting inside the corresponding submat.
3195     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3196       A[i], B[ii], AA[i] or BB[ii] matrices.
3197   */
3198   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3199   if (scall == MAT_INITIAL_MATRIX) {
3200     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3201   }
3202 
3203   /* Now obtain the sequential A and B submatrices separately. */
3204   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3205   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3206   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3207 
3208   /*
3209     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3210     matrices A & B have been extracted directly into the parallel matrices containing them, or
3211     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3212     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3213     to have the same sparsity pattern.
3214     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3215     must be constructed for C. This is done by setseqmat(s).
3216   */
3217   for (i = 0, ii = 0; i < ismax; ++i) {
3218     /*
3219        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3220        That way we can avoid sorting and computing permutations when reusing.
3221        To this end:
3222         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3223 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3224 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3225     */
3226     MatStructure pattern;
3227     pattern = DIFFERENT_NONZERO_PATTERN;
3228 
3229     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3230     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3231     if (isize > 1) {
3232       if (scall == MAT_INITIAL_MATRIX) {
3233 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3234 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3235 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3236 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3237 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3238       }
3239       /*
3240 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3241       */
3242       {
3243 	Mat AA,BB;
3244         AA = A[i];
3245         BB = B[ii];
3246 	if (AA || BB) {
3247 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3248 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3249 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3250 	}
3251 
3252         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3253       }
3254       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3255       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3256       ++ii;
3257     } else { /* if (isize == 1) */
3258       if (scall == MAT_REUSE_MATRIX) {
3259         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3260       }
3261       if (isrow_p[i] || iscol_p[i]) {
3262         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3263         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3264 	/* Otherwise A is extracted straight into (*submats)[i]. */
3265 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3266 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3267       } else (*submat)[i] = A[i];
3268     }
3269     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3270     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3271   }
3272   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3273   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3274   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3275   ierr = PetscFree(A);CHKERRQ(ierr);
3276   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3277   PetscFunctionReturn(0);
3278 }
3279 
3280 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3281 {
3282   PetscErrorCode ierr;
3283 
3284   PetscFunctionBegin;
3285   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3286   PetscFunctionReturn(0);
3287 }
3288