xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision cdb0f33d09c128f365fdb48a6f07c56e211b6a43)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put all together so that we
69    * can optimize communication
70    *  */
71   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++) {
82     length_i  = length[i];
83     indices_i = indices[i];
84     for (j=0; j<length_i; j++) {
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank) {
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages
99    * generally speaking, we do not need to exchange
100    * data when overlap is 1
101    * */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages
104    * It usually corresponds to overlap 1
105    * */
106   if (!reducednrrows) {
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communcation */
114   for (i=0; i<size; i++) {
115     if (tosizes_temp[i]) {
116       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117       tosizes_temp[i] = nto; /* a map from processor to index */
118       toranks[nto++]  = i; /* processor */
119     }
120   }
121   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   toffsets[0] = 0;
123   for (i=0; i<nto; i++) {
124     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
125     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
126   }
127   /* send information to other processors */
128   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
129   nrecvrows = 0;
130   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
131   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
132   nrecvrows = 0;
133   for (i=0; i<nfrom; i++) {
134     for (j=0; j<fromsizes[2*i]; j++) {
135       remote[nrecvrows].rank    = fromranks[i];
136       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
137     }
138   }
139   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
140   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
141   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
142   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
143   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
144   /* message pair <no of is, row>  */
145   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
146   for (i=0; i<nrrows; i++) {
147     owner = rrow_ranks[i]; /* processor */
148     j     = tosizes_temp[owner]; /* index */
149     todata[toffsets[j]++] = rrow_isids[i];
150     todata[toffsets[j]++] = remoterows[i];
151   }
152   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
153   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
154   ierr = PetscFree(toffsets);CHKERRQ(ierr);
155   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
157   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
158   /* send rows belonging to the remote so that then we could get the overlapping data back */
159   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
160   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
162   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
163   ierr = PetscFree(fromranks);CHKERRQ(ierr);
164   nrecvrows = 0;
165   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
166   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
167   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
168   nrecvrows = 0;
169   for (i=0; i<nto; i++) {
170     for (j=0; j<tosizes[2*i]; j++) {
171       remote[nrecvrows].rank    = toranks[i];
172       remote[nrecvrows++].index = tosizes[2*i+1]+j;
173     }
174   }
175   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
176   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
177   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
178   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
179   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
180   /* overlap communication and computation */
181   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
182   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
183   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
184   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
185   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
186   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
187   ierr = PetscFree(toranks);CHKERRQ(ierr);
188   ierr = PetscFree(tosizes);CHKERRQ(ierr);
189   ierr = PetscFree(todata);CHKERRQ(ierr);
190   PetscFunctionReturn(0);
191 }
192 
193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
194 {
195   PetscInt         *isz,isz_i,i,j,is_id, data_size;
196   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
197   const PetscInt   *indices_i_temp;
198   MPI_Comm         *iscomms;
199   PetscErrorCode    ierr;
200 
201   PetscFunctionBegin;
202   max_lsize = 0;
203   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
204   for (i=0; i<nidx; i++){
205     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
206     max_lsize = lsize>max_lsize ? lsize:max_lsize;
207     isz[i]    = lsize;
208   }
209   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
212     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
214     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
215     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
216   }
217   /* retrieve information to get row id and its overlap */
218   for (i=0; i<nrecvs; ){
219     is_id     = recvdata[i++];
220     data_size = recvdata[i++];
221     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
222     isz_i     = isz[is_id];
223     for (j=0; j< data_size; j++){
224       col = recvdata[i++];
225       indices_i[isz_i++] = col;
226     }
227     isz[is_id] = isz_i;
228   }
229   /* remove duplicate entities */
230   for (i=0; i<nidx; i++){
231     indices_i = indices_temp+(max_lsize+nrecvs)*i;
232     isz_i     = isz[i];
233     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
234     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
235     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
236   }
237   ierr = PetscFree(isz);CHKERRQ(ierr);
238   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
243 {
244   PetscLayout       rmap,cmap;
245   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
246   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
247   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
248   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
249   Mat               amat,bmat;
250   PetscMPIInt       rank;
251   PetscBool         done;
252   MPI_Comm          comm;
253   PetscErrorCode    ierr;
254 
255   PetscFunctionBegin;
256   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
257   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
258   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
259   /* Even if the mat is symmetric, we still assume it is not symmetric */
260   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
262   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
263   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
264   /* total number of nonzero values is used to estimate the memory usage in the next step */
265   tnz  = ai[an]+bi[bn];
266   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
267   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
268   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
269   /* to find the longest message */
270   max_fszs = 0;
271   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
272   /* better way to estimate number of nonzero in the mat??? */
273   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
274   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
275   rows_pos  = 0;
276   totalrows = 0;
277   for (i=0; i<nfrom; i++){
278     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
279     /* group data together */
280     for (j=0; j<fromsizes[2*i]; j+=2){
281       is_id                       = fromrows[rows_pos++];/* no of is */
282       rows_i                      = rows_data[is_id];
283       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
284     }
285     /* estimate a space to avoid multiple allocations  */
286     for (j=0; j<nidx; j++){
287       indvc_ij = 0;
288       rows_i   = rows_data[j];
289       for (l=0; l<rows_pos_i[j]; l++){
290         row    = rows_i[l]-rstart;
291         start  = ai[row];
292         end    = ai[row+1];
293         for (k=start; k<end; k++){ /* Amat */
294           col = aj[k] + cstart;
295           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
296         }
297         start = bi[row];
298         end   = bi[row+1];
299         for (k=start; k<end; k++) { /* Bmat */
300           col = gcols[bj[k]];
301           indices_tmp[indvc_ij++] = col;
302         }
303       }
304       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
305       indv_counts[i*nidx+j] = indvc_ij;
306       totalrows            += indvc_ij;
307     }
308   }
309   /* message triple <no of is, number of rows, rows> */
310   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
311   totalrows = 0;
312   rows_pos  = 0;
313   /* use this code again */
314   for (i=0;i<nfrom;i++){
315     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
316     for (j=0; j<fromsizes[2*i]; j+=2){
317       is_id                       = fromrows[rows_pos++];
318       rows_i                      = rows_data[is_id];
319       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
320     }
321     /* add data  */
322     for (j=0; j<nidx; j++){
323       if (!indv_counts[i*nidx+j]) continue;
324       indvc_ij = 0;
325       sbdata[totalrows++] = j;
326       sbdata[totalrows++] = indv_counts[i*nidx+j];
327       sbsizes[2*i]       += 2;
328       rows_i              = rows_data[j];
329       for (l=0; l<rows_pos_i[j]; l++){
330         row   = rows_i[l]-rstart;
331         start = ai[row];
332         end   = ai[row+1];
333         for (k=start; k<end; k++){ /* Amat */
334           col = aj[k] + cstart;
335           indices_tmp[indvc_ij++] = col;
336         }
337         start = bi[row];
338         end   = bi[row+1];
339         for (k=start; k<end; k++) { /* Bmat */
340           col = gcols[bj[k]];
341           indices_tmp[indvc_ij++] = col;
342         }
343       }
344       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
345       sbsizes[2*i]  += indvc_ij;
346       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
347       totalrows += indvc_ij;
348     }
349   }
350   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
351   offsets[0] = 0;
352   for (i=0; i<nfrom; i++){
353     offsets[i+1]   = offsets[i] + sbsizes[2*i];
354     sbsizes[2*i+1] = offsets[i];
355   }
356   ierr = PetscFree(offsets);CHKERRQ(ierr);
357   if (sbrowsizes) *sbrowsizes = sbsizes;
358   if (sbrows) *sbrows = sbdata;
359   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
360   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
361   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
366 {
367   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
368   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
369   PetscInt          lsize,lsize_tmp;
370   PetscMPIInt       rank,owner;
371   Mat               amat,bmat;
372   PetscBool         done;
373   PetscLayout       cmap,rmap;
374   MPI_Comm          comm;
375   PetscErrorCode    ierr;
376 
377   PetscFunctionBegin;
378   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
379   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
380   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
381   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
383   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
384   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
385   /* is it a safe way to compute number of nonzero values ? */
386   tnz  = ai[an]+bi[bn];
387   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
388   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
389   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
390   /* it is a better way to estimate memory than the old implementation
391    * where global size of matrix is used
392    * */
393   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
394   for (i=0; i<nidx; i++) {
395     MPI_Comm iscomm;
396 
397     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
398     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
399     lsize_tmp = 0;
400     for (j=0; j<lsize; j++) {
401       owner = -1;
402       row   = indices[j];
403       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
404       if (owner != rank) continue;
405       /* local number */
406       row  -= rstart;
407       start = ai[row];
408       end   = ai[row+1];
409       for (k=start; k<end; k++) { /* Amat */
410         col = aj[k] + cstart;
411         indices_temp[lsize_tmp++] = col;
412       }
413       start = bi[row];
414       end   = bi[row+1];
415       for (k=start; k<end; k++) { /* Bmat */
416         col = gcols[bj[k]];
417         indices_temp[lsize_tmp++] = col;
418       }
419     }
420    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
421    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
422    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
423    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
424    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
425    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
426   }
427   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
428   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
429   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 
434 /*
435   Sample message format:
436   If a processor A wants processor B to process some elements corresponding
437   to index sets is[1],is[5]
438   mesg [0] = 2   (no of index sets in the mesg)
439   -----------
440   mesg [1] = 1 => is[1]
441   mesg [2] = sizeof(is[1]);
442   -----------
443   mesg [3] = 5  => is[5]
444   mesg [4] = sizeof(is[5]);
445   -----------
446   mesg [5]
447   mesg [n]  datas[1]
448   -----------
449   mesg[n+1]
450   mesg[m]  data(is[5])
451   -----------
452 
453   Notes:
454   nrqs - no of requests sent (or to be sent out)
455   nrqr - no of requests received (which have to be or which have been processed)
456 */
457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458 {
459   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
460   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461   const PetscInt **idx,*idx_i;
462   PetscInt       *n,**data,len;
463 #if defined(PETSC_USE_CTABLE)
464   PetscTable     *table_data,table_data_i;
465   PetscInt       *tdata,tcount,tcount_max;
466 #else
467   PetscInt       *data_i,*d_p;
468 #endif
469   PetscErrorCode ierr;
470   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
471   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473   PetscBT        *table;
474   MPI_Comm       comm;
475   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476   MPI_Status     *s_status,*recv_status;
477   MPI_Comm       *iscomms;
478   char           *t_p;
479 
480   PetscFunctionBegin;
481   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
482   size = c->size;
483   rank = c->rank;
484   M    = C->rmap->N;
485 
486   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
487   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
488 
489   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
490 
491   for (i=0; i<imax; i++) {
492     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
493     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
494   }
495 
496   /* evaluate communication - mesg to who,length of mesg, and buffer space
497      required. Based on this, buffers are allocated, and data copied into them  */
498   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
499   for (i=0; i<imax; i++) {
500     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
501     idx_i = idx[i];
502     len   = n[i];
503     for (j=0; j<len; j++) {
504       row = idx_i[j];
505       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
507       w4[proc]++;
508     }
509     for (j=0; j<size; j++) {
510       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511     }
512   }
513 
514   nrqs     = 0;              /* no of outgoing messages */
515   msz      = 0;              /* total mesg length (for all proc */
516   w1[rank] = 0;              /* no mesg sent to intself */
517   w3[rank] = 0;
518   for (i=0; i<size; i++) {
519     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520   }
521   /* pa - is list of processors to communicate with */
522   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
523   for (i=0,j=0; i<size; i++) {
524     if (w1[i]) {pa[j] = i; j++;}
525   }
526 
527   /* Each message would have a header = 1 + 2*(no of IS) + data */
528   for (i=0; i<nrqs; i++) {
529     j      = pa[i];
530     w1[j] += w2[j] + 2*w3[j];
531     msz   += w1[j];
532   }
533 
534   /* Determine the number of messages to expect, their lengths, from from-ids */
535   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
536   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
537 
538   /* Now post the Irecvs corresponding to these messages */
539   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
540 
541   /* Allocate Memory for outgoing messages */
542   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
543   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
544   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
545 
546   {
547     PetscInt *iptr = tmp,ict  = 0;
548     for (i=0; i<nrqs; i++) {
549       j         = pa[i];
550       iptr     +=  ict;
551       outdat[j] = iptr;
552       ict       = w1[j];
553     }
554   }
555 
556   /* Form the outgoing messages */
557   /* plug in the headers */
558   for (i=0; i<nrqs; i++) {
559     j            = pa[i];
560     outdat[j][0] = 0;
561     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
562     ptr[j]       = outdat[j] + 2*w3[j] + 1;
563   }
564 
565   /* Memory for doing local proc's work */
566   {
567     PetscInt M_BPB_imax = 0;
568 #if defined(PETSC_USE_CTABLE)
569     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
570     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
571     for (i=0; i<imax; i++) {
572       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
573     }
574     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
575     for (i=0; i<imax; i++) {
576       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577     }
578 #else
579     PetscInt Mimax = 0;
580     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
581     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
582     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
583     for (i=0; i<imax; i++) {
584       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585       data[i]  = d_p + M*i;
586     }
587 #endif
588   }
589 
590   /* Parse the IS and update local tables and the outgoing buf with the data */
591   {
592     PetscInt n_i,isz_i,*outdat_j,ctr_j;
593     PetscBT  table_i;
594 
595     for (i=0; i<imax; i++) {
596       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
597       n_i     = n[i];
598       table_i = table[i];
599       idx_i   = idx[i];
600 #if defined(PETSC_USE_CTABLE)
601       table_data_i = table_data[i];
602 #else
603       data_i  = data[i];
604 #endif
605       isz_i   = isz[i];
606       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
607         row  = idx_i[j];
608         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
609         if (proc != rank) { /* copy to the outgoing buffer */
610           ctr[proc]++;
611           *ptr[proc] = row;
612           ptr[proc]++;
613         } else if (!PetscBTLookupSet(table_i,row)) {
614 #if defined(PETSC_USE_CTABLE)
615           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
616 #else
617           data_i[isz_i] = row; /* Update the local table */
618 #endif
619           isz_i++;
620         }
621       }
622       /* Update the headers for the current IS */
623       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624         if ((ctr_j = ctr[j])) {
625           outdat_j        = outdat[j];
626           k               = ++outdat_j[0];
627           outdat_j[2*k]   = ctr_j;
628           outdat_j[2*k-1] = i;
629         }
630       }
631       isz[i] = isz_i;
632     }
633   }
634 
635   /*  Now  post the sends */
636   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
637   for (i=0; i<nrqs; ++i) {
638     j    = pa[i];
639     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
640   }
641 
642   /* No longer need the original indices */
643   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
644   for (i=0; i<imax; ++i) {
645     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
646     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
647   }
648   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
649 
650   for (i=0; i<imax; ++i) {
651     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
652   }
653 
654   /* Do Local work */
655 #if defined(PETSC_USE_CTABLE)
656   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
657 #else
658   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
659 #endif
660 
661   /* Receive messages */
662   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
663   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
664 
665   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
666   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
667 
668   /* Phase 1 sends are complete - deallocate buffers */
669   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
670   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
671 
672   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
673   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
674   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
675   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
676   ierr = PetscFree(rbuf);CHKERRQ(ierr);
677 
678 
679   /* Send the data back */
680   /* Do a global reduction to know the buffer space req for incoming messages */
681   {
682     PetscMPIInt *rw1;
683 
684     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
685 
686     for (i=0; i<nrqr; ++i) {
687       proc = recv_status[i].MPI_SOURCE;
688 
689       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690       rw1[proc] = isz1[i];
691     }
692     ierr = PetscFree(onodes1);CHKERRQ(ierr);
693     ierr = PetscFree(olengths1);CHKERRQ(ierr);
694 
695     /* Determine the number of messages to expect, their lengths, from from-ids */
696     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
697     ierr = PetscFree(rw1);CHKERRQ(ierr);
698   }
699   /* Now post the Irecvs corresponding to these messages */
700   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
701 
702   /* Now  post the sends */
703   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
704   for (i=0; i<nrqr; ++i) {
705     j    = recv_status[i].MPI_SOURCE;
706     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
707   }
708 
709   /* receive work done on other processors */
710   {
711     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712     PetscMPIInt idex;
713     PetscBT     table_i;
714     MPI_Status  *status2;
715 
716     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
717     for (i=0; i<nrqs; ++i) {
718       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
719       /* Process the message */
720       rbuf2_i = rbuf2[idex];
721       ct1     = 2*rbuf2_i[0]+1;
722       jmax    = rbuf2[idex][0];
723       for (j=1; j<=jmax; j++) {
724         max     = rbuf2_i[2*j];
725         is_no   = rbuf2_i[2*j-1];
726         isz_i   = isz[is_no];
727         table_i = table[is_no];
728 #if defined(PETSC_USE_CTABLE)
729         table_data_i = table_data[is_no];
730 #else
731         data_i  = data[is_no];
732 #endif
733         for (k=0; k<max; k++,ct1++) {
734           row = rbuf2_i[ct1];
735           if (!PetscBTLookupSet(table_i,row)) {
736 #if defined(PETSC_USE_CTABLE)
737             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
738 #else
739             data_i[isz_i] = row;
740 #endif
741             isz_i++;
742           }
743         }
744         isz[is_no] = isz_i;
745       }
746     }
747 
748     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
749     ierr = PetscFree(status2);CHKERRQ(ierr);
750   }
751 
752 #if defined(PETSC_USE_CTABLE)
753   tcount_max = 0;
754   for (i=0; i<imax; ++i) {
755     table_data_i = table_data[i];
756     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
757     if (tcount_max < tcount) tcount_max = tcount;
758   }
759   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
760 #endif
761 
762   for (i=0; i<imax; ++i) {
763 #if defined(PETSC_USE_CTABLE)
764     PetscTablePosition tpos;
765     table_data_i = table_data[i];
766 
767     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
768     while (tpos) {
769       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
770       tdata[--j] = --k;
771     }
772     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
773 #else
774     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
775 #endif
776     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
777   }
778 
779   ierr = PetscFree(iscomms);CHKERRQ(ierr);
780   ierr = PetscFree(onodes2);CHKERRQ(ierr);
781   ierr = PetscFree(olengths2);CHKERRQ(ierr);
782 
783   ierr = PetscFree(pa);CHKERRQ(ierr);
784   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
785   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
786   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
787   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
790   ierr = PetscFree(s_status);CHKERRQ(ierr);
791   ierr = PetscFree(recv_status);CHKERRQ(ierr);
792   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
793   ierr = PetscFree(xdata);CHKERRQ(ierr);
794   ierr = PetscFree(isz1);CHKERRQ(ierr);
795 #if defined(PETSC_USE_CTABLE)
796   for (i=0; i<imax; i++) {
797     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
798   }
799   ierr = PetscFree(table_data);CHKERRQ(ierr);
800   ierr = PetscFree(tdata);CHKERRQ(ierr);
801   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
802 #else
803   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
804 #endif
805   PetscFunctionReturn(0);
806 }
807 
808 /*
809    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
810        the work on the local processor.
811 
812      Inputs:
813       C      - MAT_MPIAIJ;
814       imax - total no of index sets processed at a time;
815       table  - an array of char - size = m bits.
816 
817      Output:
818       isz    - array containing the count of the solution elements corresponding
819                to each index set;
820       data or table_data  - pointer to the solutions
821 */
822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823 {
824   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825   Mat        A  = c->A,B = c->B;
826   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
828   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
829   PetscBT    table_i;
830 #if defined(PETSC_USE_CTABLE)
831   PetscTable         table_data_i;
832   PetscErrorCode     ierr;
833   PetscTablePosition tpos;
834   PetscInt           tcount,*tdata;
835 #else
836   PetscInt           *data_i;
837 #endif
838 
839   PetscFunctionBegin;
840   rstart = C->rmap->rstart;
841   cstart = C->cmap->rstart;
842   ai     = a->i;
843   aj     = a->j;
844   bi     = b->i;
845   bj     = b->j;
846   garray = c->garray;
847 
848   for (i=0; i<imax; i++) {
849 #if defined(PETSC_USE_CTABLE)
850     /* copy existing entries of table_data_i into tdata[] */
851     table_data_i = table_data[i];
852     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
853     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
854 
855     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
856     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
857     while (tpos) {
858       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
859       tdata[--j] = --row;
860       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
861     }
862 #else
863     data_i  = data[i];
864 #endif
865     table_i = table[i];
866     isz_i   = isz[i];
867     max     = isz[i];
868 
869     for (j=0; j<max; j++) {
870 #if defined(PETSC_USE_CTABLE)
871       row   = tdata[j] - rstart;
872 #else
873       row   = data_i[j] - rstart;
874 #endif
875       start = ai[row];
876       end   = ai[row+1];
877       for (k=start; k<end; k++) { /* Amat */
878         val = aj[k] + cstart;
879         if (!PetscBTLookupSet(table_i,val)) {
880 #if defined(PETSC_USE_CTABLE)
881           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
882 #else
883           data_i[isz_i] = val;
884 #endif
885           isz_i++;
886         }
887       }
888       start = bi[row];
889       end   = bi[row+1];
890       for (k=start; k<end; k++) { /* Bmat */
891         val = garray[bj[k]];
892         if (!PetscBTLookupSet(table_i,val)) {
893 #if defined(PETSC_USE_CTABLE)
894           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
895 #else
896           data_i[isz_i] = val;
897 #endif
898           isz_i++;
899         }
900       }
901     }
902     isz[i] = isz_i;
903 
904 #if defined(PETSC_USE_CTABLE)
905     ierr = PetscFree(tdata);CHKERRQ(ierr);
906 #endif
907   }
908   PetscFunctionReturn(0);
909 }
910 
911 /*
912       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913          and return the output
914 
915          Input:
916            C    - the matrix
917            nrqr - no of messages being processed.
918            rbuf - an array of pointers to the received requests
919 
920          Output:
921            xdata - array of messages to be sent back
922            isz1  - size of each message
923 
924   For better efficiency perhaps we should malloc separately each xdata[i],
925 then if a remalloc is required we need only copy the data for that one row
926 rather then all previous rows as it is now where a single large chunck of
927 memory is used.
928 
929 */
930 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
931 {
932   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
933   Mat            A  = c->A,B = c->B;
934   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
935   PetscErrorCode ierr;
936   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
937   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
938   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
939   PetscInt       *rbuf_i,kmax,rbuf_0;
940   PetscBT        xtable;
941 
942   PetscFunctionBegin;
943   m      = C->rmap->N;
944   rstart = C->rmap->rstart;
945   cstart = C->cmap->rstart;
946   ai     = a->i;
947   aj     = a->j;
948   bi     = b->i;
949   bj     = b->j;
950   garray = c->garray;
951 
952 
953   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
954     rbuf_i =  rbuf[i];
955     rbuf_0 =  rbuf_i[0];
956     ct    += rbuf_0;
957     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
958   }
959 
960   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
961   else max1 = 1;
962   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
963   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
964   ++no_malloc;
965   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
966   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
967 
968   ct3 = 0;
969   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
970     rbuf_i =  rbuf[i];
971     rbuf_0 =  rbuf_i[0];
972     ct1    =  2*rbuf_0+1;
973     ct2    =  ct1;
974     ct3   += ct1;
975     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
976       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
977       oct2 = ct2;
978       kmax = rbuf_i[2*j];
979       for (k=0; k<kmax; k++,ct1++) {
980         row = rbuf_i[ct1];
981         if (!PetscBTLookupSet(xtable,row)) {
982           if (!(ct3 < mem_estimate)) {
983             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
984             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
985             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
986             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
987             xdata[0]     = tmp;
988             mem_estimate = new_estimate; ++no_malloc;
989             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
990           }
991           xdata[i][ct2++] = row;
992           ct3++;
993         }
994       }
995       for (k=oct2,max2=ct2; k<max2; k++) {
996         row   = xdata[i][k] - rstart;
997         start = ai[row];
998         end   = ai[row+1];
999         for (l=start; l<end; l++) {
1000           val = aj[l] + cstart;
1001           if (!PetscBTLookupSet(xtable,val)) {
1002             if (!(ct3 < mem_estimate)) {
1003               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1004               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1005               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1006               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1007               xdata[0]     = tmp;
1008               mem_estimate = new_estimate; ++no_malloc;
1009               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1010             }
1011             xdata[i][ct2++] = val;
1012             ct3++;
1013           }
1014         }
1015         start = bi[row];
1016         end   = bi[row+1];
1017         for (l=start; l<end; l++) {
1018           val = garray[bj[l]];
1019           if (!PetscBTLookupSet(xtable,val)) {
1020             if (!(ct3 < mem_estimate)) {
1021               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1022               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1023               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1024               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1025               xdata[0]     = tmp;
1026               mem_estimate = new_estimate; ++no_malloc;
1027               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1028             }
1029             xdata[i][ct2++] = val;
1030             ct3++;
1031           }
1032         }
1033       }
1034       /* Update the header*/
1035       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1036       xdata[i][2*j-1] = rbuf_i[2*j-1];
1037     }
1038     xdata[i][0] = rbuf_0;
1039     xdata[i+1]  = xdata[i] + ct2;
1040     isz1[i]     = ct2; /* size of each message */
1041   }
1042   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1043   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1044   PetscFunctionReturn(0);
1045 }
1046 /* -------------------------------------------------------------------------*/
1047 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048 /*
1049     Every processor gets the entire matrix
1050 */
1051 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1052 {
1053   Mat            B;
1054   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1055   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1056   PetscErrorCode ierr;
1057   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1058   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1059   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1060   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1061 
1062   PetscFunctionBegin;
1063   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1064   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1065 
1066   if (scall == MAT_INITIAL_MATRIX) {
1067     /* ----------------------------------------------------------------
1068          Tell every processor the number of nonzeros per row
1069     */
1070     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1071     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1072       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1073     }
1074     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1075     for (i=0; i<size; i++) {
1076       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1077       displs[i]     = A->rmap->range[i];
1078     }
1079 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1080     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1081 #else
1082     sendcount = A->rmap->rend - A->rmap->rstart;
1083     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1084 #endif
1085     /* ---------------------------------------------------------------
1086          Create the sequential matrix of the same type as the local block diagonal
1087     */
1088     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1089     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1090     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1091     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1092     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1093     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1094     **Bin = B;
1095     b     = (Mat_SeqAIJ*)B->data;
1096 
1097     /*--------------------------------------------------------------------
1098        Copy my part of matrix column indices over
1099     */
1100     sendcount  = ad->nz + bd->nz;
1101     jsendbuf   = b->j + b->i[rstarts[rank]];
1102     a_jsendbuf = ad->j;
1103     b_jsendbuf = bd->j;
1104     n          = A->rmap->rend - A->rmap->rstart;
1105     cnt        = 0;
1106     for (i=0; i<n; i++) {
1107 
1108       /* put in lower diagonal portion */
1109       m = bd->i[i+1] - bd->i[i];
1110       while (m > 0) {
1111         /* is it above diagonal (in bd (compressed) numbering) */
1112         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1113         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1114         m--;
1115       }
1116 
1117       /* put in diagonal portion */
1118       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1119         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1120       }
1121 
1122       /* put in upper diagonal portion */
1123       while (m-- > 0) {
1124         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1125       }
1126     }
1127     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1128 
1129     /*--------------------------------------------------------------------
1130        Gather all column indices to all processors
1131     */
1132     for (i=0; i<size; i++) {
1133       recvcounts[i] = 0;
1134       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1135         recvcounts[i] += lens[j];
1136       }
1137     }
1138     displs[0] = 0;
1139     for (i=1; i<size; i++) {
1140       displs[i] = displs[i-1] + recvcounts[i-1];
1141     }
1142 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1143     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1144 #else
1145     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1146 #endif
1147     /*--------------------------------------------------------------------
1148         Assemble the matrix into useable form (note numerical values not yet set)
1149     */
1150     /* set the b->ilen (length of each row) values */
1151     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1152     /* set the b->i indices */
1153     b->i[0] = 0;
1154     for (i=1; i<=A->rmap->N; i++) {
1155       b->i[i] = b->i[i-1] + lens[i-1];
1156     }
1157     ierr = PetscFree(lens);CHKERRQ(ierr);
1158     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1159     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1160 
1161   } else {
1162     B = **Bin;
1163     b = (Mat_SeqAIJ*)B->data;
1164   }
1165 
1166   /*--------------------------------------------------------------------
1167        Copy my part of matrix numerical values into the values location
1168   */
1169   if (flag == MAT_GET_VALUES) {
1170     sendcount = ad->nz + bd->nz;
1171     sendbuf   = b->a + b->i[rstarts[rank]];
1172     a_sendbuf = ad->a;
1173     b_sendbuf = bd->a;
1174     b_sendj   = bd->j;
1175     n         = A->rmap->rend - A->rmap->rstart;
1176     cnt       = 0;
1177     for (i=0; i<n; i++) {
1178 
1179       /* put in lower diagonal portion */
1180       m = bd->i[i+1] - bd->i[i];
1181       while (m > 0) {
1182         /* is it above diagonal (in bd (compressed) numbering) */
1183         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1184         sendbuf[cnt++] = *b_sendbuf++;
1185         m--;
1186         b_sendj++;
1187       }
1188 
1189       /* put in diagonal portion */
1190       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1191         sendbuf[cnt++] = *a_sendbuf++;
1192       }
1193 
1194       /* put in upper diagonal portion */
1195       while (m-- > 0) {
1196         sendbuf[cnt++] = *b_sendbuf++;
1197         b_sendj++;
1198       }
1199     }
1200     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1201 
1202     /* -----------------------------------------------------------------
1203        Gather all numerical values to all processors
1204     */
1205     if (!recvcounts) {
1206       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1207     }
1208     for (i=0; i<size; i++) {
1209       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1210     }
1211     displs[0] = 0;
1212     for (i=1; i<size; i++) {
1213       displs[i] = displs[i-1] + recvcounts[i-1];
1214     }
1215     recvbuf = b->a;
1216 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1217     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1218 #else
1219     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1220 #endif
1221   }  /* endof (flag == MAT_GET_VALUES) */
1222   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1223 
1224   if (A->symmetric) {
1225     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1226   } else if (A->hermitian) {
1227     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1228   } else if (A->structurally_symmetric) {
1229     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1230   }
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1235 {
1236   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1237   Mat            submat,A = c->A,B = c->B;
1238   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1239   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1240   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1241   const PetscInt *icol,*irow;
1242   PetscInt       nrow,ncol,start;
1243   PetscErrorCode ierr;
1244   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1245   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1246   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1247   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1248   PetscInt       *lens,rmax,ncols,*cols,Crow;
1249 #if defined(PETSC_USE_CTABLE)
1250   PetscTable     cmap,rmap;
1251   PetscInt       *cmap_loc,*rmap_loc;
1252 #else
1253   PetscInt       *cmap,*rmap;
1254 #endif
1255   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1256   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1257   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1258   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1259   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1260   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1261   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1262   MPI_Comm       comm;
1263   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1264   PetscMPIInt    *onodes1,*olengths1,idex,end;
1265   Mat_SubSppt    *smatis1;
1266   PetscBool      isrowsorted,iscolsorted;
1267 
1268   PetscFunctionBegin;
1269   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1270 
1271   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1272   size = c->size;
1273   rank = c->rank;
1274 
1275   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1276   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1277   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1278   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1279   if (allcolumns) {
1280     icol = NULL;
1281     ncol = C->cmap->N;
1282   } else {
1283     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1284     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1285   }
1286 
1287   if (scall == MAT_INITIAL_MATRIX) {
1288     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1289 
1290     /* Get some new tags to keep the communication clean */
1291     tag1 = ((PetscObject)C)->tag;
1292     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1293     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1294 
1295     /* evaluate communication - mesg to who, length of mesg, and buffer space
1296      required. Based on this, buffers are allocated, and data copied into them */
1297     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1298     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1299 
1300     /* w1[proc] = num of rows owned by proc -- to be requested */
1301     proc = 0;
1302     nrqs = 0; /* num of outgoing messages */
1303     for (j=0; j<nrow; j++) {
1304       row  = irow[j];
1305       if (!isrowsorted) proc = 0;
1306       while (row >= C->rmap->range[proc+1]) proc++;
1307       w1[proc]++;
1308       row2proc[j] = proc; /* map row index to proc */
1309 
1310       if (proc != rank && !w2[proc]) {
1311         w2[proc] = 1; nrqs++;
1312       }
1313     }
1314     w1[rank] = 0;  /* rows owned by self will not be requested */
1315 
1316     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1317     for (proc=0,j=0; proc<size; proc++) {
1318       if (w1[proc]) { pa[j++] = proc;}
1319     }
1320 
1321     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1322     msz = 0;              /* total mesg length (for all procs) */
1323     for (i=0; i<nrqs; i++) {
1324       proc      = pa[i];
1325       w1[proc] += 3;
1326       msz      += w1[proc];
1327     }
1328     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1329 
1330     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1331     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1332     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1333 
1334     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1335        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1336     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1337 
1338     /* Now post the Irecvs corresponding to these messages */
1339     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1340 
1341     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1342     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1343 
1344     /* Allocate Memory for outgoing messages */
1345     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1346     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1347     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1348 
1349     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1350     iptr = tmp;
1351     for (i=0; i<nrqs; i++) {
1352       proc        = pa[i];
1353       sbuf1[proc] = iptr;
1354       iptr       += w1[proc];
1355     }
1356 
1357     /* Form the outgoing messages */
1358     /* Initialize the header space */
1359     for (i=0; i<nrqs; i++) {
1360       proc      = pa[i];
1361       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1362       ptr[proc] = sbuf1[proc] + 3;
1363     }
1364 
1365     /* Parse the isrow and copy data into outbuf */
1366     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1367     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1368       proc = row2proc[j];
1369       if (proc != rank) { /* copy to the outgoing buf*/
1370         *ptr[proc] = irow[j];
1371         ctr[proc]++; ptr[proc]++;
1372       }
1373     }
1374 
1375     /* Update the headers for the current IS */
1376     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1377       if ((ctr_j = ctr[j])) {
1378         sbuf1_j        = sbuf1[j];
1379         k              = ++sbuf1_j[0];
1380         sbuf1_j[2*k]   = ctr_j;
1381         sbuf1_j[2*k-1] = 0;
1382       }
1383     }
1384 
1385     /* Now post the sends */
1386     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1387     for (i=0; i<nrqs; ++i) {
1388       proc = pa[i];
1389       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1390     }
1391 
1392     /* Post Receives to capture the buffer size */
1393     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1394     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1395 
1396     rbuf2[0] = tmp + msz;
1397     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1398 
1399     for (i=0; i<nrqs; ++i) {
1400       proc = pa[i];
1401       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1402     }
1403 
1404     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1405 
1406     /* Send to other procs the buf size they should allocate */
1407     /* Receive messages*/
1408     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1409     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1410 
1411     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1412     for (i=0; i<nrqr; ++i) {
1413       req_size[i] = 0;
1414       rbuf1_i        = rbuf1[i];
1415       start          = 2*rbuf1_i[0] + 1;
1416       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1417       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1418       sbuf2_i        = sbuf2[i];
1419       for (j=start; j<end; j++) {
1420         k            = rbuf1_i[j] - rstart;
1421         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1422         sbuf2_i[j]   = ncols;
1423         req_size[i] += ncols;
1424       }
1425       req_source1[i] = r_status1[i].MPI_SOURCE;
1426 
1427       /* form the header */
1428       sbuf2_i[0] = req_size[i];
1429       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1430 
1431       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1432     }
1433 
1434     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1435     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1436 
1437     /* rbuf2 is received, Post recv column indices a->j */
1438     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1439 
1440     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1441     for (i=0; i<nrqs; ++i) {
1442       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1443       req_source2[i] = r_status2[i].MPI_SOURCE;
1444       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1445     }
1446 
1447     /* Wait on sends1 and sends2 */
1448     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1449     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1450     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1451     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1452 
1453     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1454     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1455 
1456     /* Now allocate sending buffers for a->j, and send them off */
1457     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1458     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1459     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1460     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1461 
1462     for (i=0; i<nrqr; i++) { /* for each requested message */
1463       rbuf1_i   = rbuf1[i];
1464       sbuf_aj_i = sbuf_aj[i];
1465       ct1       = 2*rbuf1_i[0] + 1;
1466       ct2       = 0;
1467       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1468 
1469       kmax = rbuf1[i][2];
1470       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1471         row    = rbuf1_i[ct1] - rstart;
1472         nzA    = ai[row+1] - ai[row];
1473         nzB    = bi[row+1] - bi[row];
1474         ncols  = nzA + nzB;
1475         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1476 
1477         /* load the column indices for this row into cols*/
1478         cols = sbuf_aj_i + ct2;
1479 
1480         lwrite = 0;
1481         for (l=0; l<nzB; l++) {
1482           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1483         }
1484         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1485         for (l=0; l<nzB; l++) {
1486           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1487         }
1488 
1489         ct2 += ncols;
1490       }
1491       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1492     }
1493 
1494     /* create column map (cmap): global col of C -> local col of submat */
1495 #if defined(PETSC_USE_CTABLE)
1496     if (!allcolumns) {
1497       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1498       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1499       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1500         if (icol[j] >= cstart && icol[j] <cend) {
1501           cmap_loc[icol[j] - cstart] = j+1;
1502         } else { /* use PetscTable for non-local col indices */
1503           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1504         }
1505       }
1506     } else {
1507       cmap     = NULL;
1508       cmap_loc = NULL;
1509     }
1510     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1511 #else
1512     if (!allcolumns) {
1513       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1514       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1515     } else {
1516       cmap = NULL;
1517     }
1518 #endif
1519 
1520     /* Create lens for MatSeqAIJSetPreallocation() */
1521     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1522 
1523     /* Compute lens from local part of C */
1524     for (j=0; j<nrow; j++) {
1525       row  = irow[j];
1526       proc = row2proc[j];
1527       if (proc == rank) {
1528         /* diagonal part A = c->A */
1529         ncols = ai[row-rstart+1] - ai[row-rstart];
1530         cols  = aj + ai[row-rstart];
1531         if (!allcolumns) {
1532           for (k=0; k<ncols; k++) {
1533 #if defined(PETSC_USE_CTABLE)
1534             tcol = cmap_loc[cols[k]];
1535 #else
1536             tcol = cmap[cols[k]+cstart];
1537 #endif
1538             if (tcol) lens[j]++;
1539           }
1540         } else { /* allcolumns */
1541           lens[j] = ncols;
1542         }
1543 
1544         /* off-diagonal part B = c->B */
1545         ncols = bi[row-rstart+1] - bi[row-rstart];
1546         cols  = bj + bi[row-rstart];
1547         if (!allcolumns) {
1548           for (k=0; k<ncols; k++) {
1549 #if defined(PETSC_USE_CTABLE)
1550             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1551 #else
1552             tcol = cmap[bmap[cols[k]]];
1553 #endif
1554             if (tcol) lens[j]++;
1555           }
1556         } else { /* allcolumns */
1557           lens[j] += ncols;
1558         }
1559       }
1560     }
1561 
1562     /* Create row map (rmap): global row of C -> local row of submat */
1563 #if defined(PETSC_USE_CTABLE)
1564     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1565     for (j=0; j<nrow; j++) {
1566       row  = irow[j];
1567       proc = row2proc[j];
1568       if (proc == rank) { /* a local row */
1569         rmap_loc[row - rstart] = j;
1570       } else {
1571         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1572       }
1573     }
1574 #else
1575     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1576     for (j=0; j<nrow; j++) {
1577       rmap[irow[j]] = j;
1578     }
1579 #endif
1580 
1581     /* Update lens from offproc data */
1582     /* recv a->j is done */
1583     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1584     for (i=0; i<nrqs; i++) {
1585       proc    = pa[i];
1586       sbuf1_i = sbuf1[proc];
1587       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1588       ct1     = 2 + 1;
1589       ct2     = 0;
1590       rbuf2_i = rbuf2[i]; /* received length of C->j */
1591       rbuf3_i = rbuf3[i]; /* received C->j */
1592 
1593       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1594       max1   = sbuf1_i[2];
1595       for (k=0; k<max1; k++,ct1++) {
1596 #if defined(PETSC_USE_CTABLE)
1597         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1598         row--;
1599         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1600 #else
1601         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1602 #endif
1603         /* Now, store row index of submat in sbuf1_i[ct1] */
1604         sbuf1_i[ct1] = row;
1605 
1606         nnz = rbuf2_i[ct1];
1607         if (!allcolumns) {
1608           for (l=0; l<nnz; l++,ct2++) {
1609 #if defined(PETSC_USE_CTABLE)
1610             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1611               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1612             } else {
1613               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1614             }
1615 #else
1616             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1617 #endif
1618             if (tcol) lens[row]++;
1619           }
1620         } else { /* allcolumns */
1621           lens[row] += nnz;
1622         }
1623       }
1624     }
1625     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1626     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1627 
1628     /* Create the submatrices */
1629     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1630     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1631 
1632     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1633     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1634     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1635     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1636     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1637 
1638     /* create struct Mat_SubSppt and attached it to submat */
1639     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1640     subc = (Mat_SeqAIJ*)submat->data;
1641     subc->submatis1 = smatis1;
1642 
1643     smatis1->id          = 0;
1644     smatis1->nrqs        = nrqs;
1645     smatis1->nrqr        = nrqr;
1646     smatis1->rbuf1       = rbuf1;
1647     smatis1->rbuf2       = rbuf2;
1648     smatis1->rbuf3       = rbuf3;
1649     smatis1->sbuf2       = sbuf2;
1650     smatis1->req_source2 = req_source2;
1651 
1652     smatis1->sbuf1       = sbuf1;
1653     smatis1->ptr         = ptr;
1654     smatis1->tmp         = tmp;
1655     smatis1->ctr         = ctr;
1656 
1657     smatis1->pa           = pa;
1658     smatis1->req_size     = req_size;
1659     smatis1->req_source1  = req_source1;
1660 
1661     smatis1->allcolumns  = allcolumns;
1662     smatis1->singleis    = PETSC_TRUE;
1663     smatis1->row2proc    = row2proc;
1664     smatis1->rmap        = rmap;
1665     smatis1->cmap        = cmap;
1666 #if defined(PETSC_USE_CTABLE)
1667     smatis1->rmap_loc    = rmap_loc;
1668     smatis1->cmap_loc    = cmap_loc;
1669 #endif
1670 
1671     smatis1->destroy     = submat->ops->destroy;
1672     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1673     submat->factortype   = C->factortype;
1674 
1675     /* compute rmax */
1676     rmax = 0;
1677     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1678 
1679   } else { /* scall == MAT_REUSE_MATRIX */
1680     submat = submats[0];
1681     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1682 
1683     subc    = (Mat_SeqAIJ*)submat->data;
1684     rmax    = subc->rmax;
1685     smatis1 = subc->submatis1;
1686     nrqs        = smatis1->nrqs;
1687     nrqr        = smatis1->nrqr;
1688     rbuf1       = smatis1->rbuf1;
1689     rbuf2       = smatis1->rbuf2;
1690     rbuf3       = smatis1->rbuf3;
1691     req_source2 = smatis1->req_source2;
1692 
1693     sbuf1     = smatis1->sbuf1;
1694     sbuf2     = smatis1->sbuf2;
1695     ptr       = smatis1->ptr;
1696     tmp       = smatis1->tmp;
1697     ctr       = smatis1->ctr;
1698 
1699     pa         = smatis1->pa;
1700     req_size   = smatis1->req_size;
1701     req_source1 = smatis1->req_source1;
1702 
1703     allcolumns = smatis1->allcolumns;
1704     row2proc   = smatis1->row2proc;
1705     rmap       = smatis1->rmap;
1706     cmap       = smatis1->cmap;
1707 #if defined(PETSC_USE_CTABLE)
1708     rmap_loc   = smatis1->rmap_loc;
1709     cmap_loc   = smatis1->cmap_loc;
1710 #endif
1711   }
1712 
1713   /* Post recv matrix values */
1714   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1715   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1716   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1717   for (i=0; i<nrqs; ++i) {
1718     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1719     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1720   }
1721 
1722   /* Allocate sending buffers for a->a, and send them off */
1723   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1724   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1725   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1726   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1727 
1728   for (i=0; i<nrqr; i++) {
1729     rbuf1_i   = rbuf1[i];
1730     sbuf_aa_i = sbuf_aa[i];
1731     ct1       = 2*rbuf1_i[0]+1;
1732     ct2       = 0;
1733     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1734 
1735     kmax = rbuf1_i[2];
1736     for (k=0; k<kmax; k++,ct1++) {
1737       row = rbuf1_i[ct1] - rstart;
1738       nzA = ai[row+1] - ai[row];
1739       nzB = bi[row+1] - bi[row];
1740       ncols  = nzA + nzB;
1741       cworkB = bj + bi[row];
1742       vworkA = a_a + ai[row];
1743       vworkB = b_a + bi[row];
1744 
1745       /* load the column values for this row into vals*/
1746       vals = sbuf_aa_i + ct2;
1747 
1748       lwrite = 0;
1749       for (l=0; l<nzB; l++) {
1750         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1751       }
1752       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1753       for (l=0; l<nzB; l++) {
1754         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1755       }
1756 
1757       ct2 += ncols;
1758     }
1759     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1760   }
1761 
1762   /* Assemble submat */
1763   /* First assemble the local rows */
1764   for (j=0; j<nrow; j++) {
1765     row  = irow[j];
1766     proc = row2proc[j];
1767     if (proc == rank) {
1768       Crow = row - rstart;  /* local row index of C */
1769 #if defined(PETSC_USE_CTABLE)
1770       row = rmap_loc[Crow]; /* row index of submat */
1771 #else
1772       row = rmap[row];
1773 #endif
1774 
1775       if (allcolumns) {
1776         /* diagonal part A = c->A */
1777         ncols = ai[Crow+1] - ai[Crow];
1778         cols  = aj + ai[Crow];
1779         vals  = a->a + ai[Crow];
1780         i     = 0;
1781         for (k=0; k<ncols; k++) {
1782           subcols[i]   = cols[k] + cstart;
1783           subvals[i++] = vals[k];
1784         }
1785 
1786         /* off-diagonal part B = c->B */
1787         ncols = bi[Crow+1] - bi[Crow];
1788         cols  = bj + bi[Crow];
1789         vals  = b->a + bi[Crow];
1790         for (k=0; k<ncols; k++) {
1791           subcols[i]   = bmap[cols[k]];
1792           subvals[i++] = vals[k];
1793         }
1794 
1795         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1796 
1797       } else { /* !allcolumns */
1798 #if defined(PETSC_USE_CTABLE)
1799         /* diagonal part A = c->A */
1800         ncols = ai[Crow+1] - ai[Crow];
1801         cols  = aj + ai[Crow];
1802         vals  = a->a + ai[Crow];
1803         i     = 0;
1804         for (k=0; k<ncols; k++) {
1805           tcol = cmap_loc[cols[k]];
1806           if (tcol) {
1807             subcols[i]   = --tcol;
1808             subvals[i++] = vals[k];
1809           }
1810         }
1811 
1812         /* off-diagonal part B = c->B */
1813         ncols = bi[Crow+1] - bi[Crow];
1814         cols  = bj + bi[Crow];
1815         vals  = b->a + bi[Crow];
1816         for (k=0; k<ncols; k++) {
1817           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1818           if (tcol) {
1819             subcols[i]   = --tcol;
1820             subvals[i++] = vals[k];
1821           }
1822         }
1823 #else
1824         /* diagonal part A = c->A */
1825         ncols = ai[Crow+1] - ai[Crow];
1826         cols  = aj + ai[Crow];
1827         vals  = a->a + ai[Crow];
1828         i     = 0;
1829         for (k=0; k<ncols; k++) {
1830           tcol = cmap[cols[k]+cstart];
1831           if (tcol) {
1832             subcols[i]   = --tcol;
1833             subvals[i++] = vals[k];
1834           }
1835         }
1836 
1837         /* off-diagonal part B = c->B */
1838         ncols = bi[Crow+1] - bi[Crow];
1839         cols  = bj + bi[Crow];
1840         vals  = b->a + bi[Crow];
1841         for (k=0; k<ncols; k++) {
1842           tcol = cmap[bmap[cols[k]]];
1843           if (tcol) {
1844             subcols[i]   = --tcol;
1845             subvals[i++] = vals[k];
1846           }
1847         }
1848 #endif
1849         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1850       }
1851     }
1852   }
1853 
1854   /* Now assemble the off-proc rows */
1855   for (i=0; i<nrqs; i++) { /* for each requested message */
1856     /* recv values from other processes */
1857     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1858     proc    = pa[idex];
1859     sbuf1_i = sbuf1[proc];
1860     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1861     ct1     = 2 + 1;
1862     ct2     = 0; /* count of received C->j */
1863     ct3     = 0; /* count of received C->j that will be inserted into submat */
1864     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1865     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1866     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1867 
1868     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1869     max1 = sbuf1_i[2];             /* num of rows */
1870     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1871       row = sbuf1_i[ct1]; /* row index of submat */
1872       if (!allcolumns) {
1873         idex = 0;
1874         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1875           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1876           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1877 #if defined(PETSC_USE_CTABLE)
1878             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1879               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1880             } else {
1881               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1882             }
1883 #else
1884             tcol = cmap[rbuf3_i[ct2]];
1885 #endif
1886             if (tcol) {
1887               subcols[idex]   = --tcol; /* may not be sorted */
1888               subvals[idex++] = rbuf4_i[ct2];
1889 
1890               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1891                For reuse, we replace received C->j with index that should be inserted to submat */
1892               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1893             }
1894           }
1895           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1896         } else { /* scall == MAT_REUSE_MATRIX */
1897           submat = submats[0];
1898           subc   = (Mat_SeqAIJ*)submat->data;
1899 
1900           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1901           for (l=0; l<nnz; l++) {
1902             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1903             subvals[idex++] = rbuf4_i[ct2];
1904           }
1905 
1906           bj = subc->j + subc->i[row]; /* sorted column indices */
1907           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1908         }
1909       } else { /* allcolumns */
1910         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1911         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1912         ct2 += nnz;
1913       }
1914     }
1915   }
1916 
1917   /* sending a->a are done */
1918   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1919   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1920 
1921   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   submats[0] = submat;
1924 
1925   /* Restore the indices */
1926   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1927   if (!allcolumns) {
1928     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1929   }
1930 
1931   /* Destroy allocated memory */
1932   for (i=0; i<nrqs; ++i) {
1933     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1934   }
1935   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1936   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1937   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1938 
1939   if (scall == MAT_INITIAL_MATRIX) {
1940     ierr = PetscFree(lens);CHKERRQ(ierr);
1941     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1942     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1943   }
1944   PetscFunctionReturn(0);
1945 }
1946 
1947 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1948 {
1949   PetscErrorCode ierr;
1950   PetscInt       ncol;
1951   PetscBool      colflag,allcolumns=PETSC_FALSE;
1952 
1953   PetscFunctionBegin;
1954   /* Allocate memory to hold all the submatrices */
1955   if (scall == MAT_INITIAL_MATRIX) {
1956     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1957   }
1958 
1959   /* Check for special case: each processor gets entire matrix columns */
1960   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1961   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1962   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1963 
1964   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1965   PetscFunctionReturn(0);
1966 }
1967 
1968 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1969 {
1970   PetscErrorCode ierr;
1971   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1972   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1973   Mat_SeqAIJ     *subc;
1974   Mat_SubSppt    *smat;
1975 
1976   PetscFunctionBegin;
1977   /* Check for special case: each processor has a single IS */
1978   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1979     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1980     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1981     PetscFunctionReturn(0);
1982   }
1983 
1984   /* Collect global wantallmatrix and nstages */
1985   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1986   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1987   if (!nmax) nmax = 1;
1988 
1989   if (scall == MAT_INITIAL_MATRIX) {
1990     /* Collect global wantallmatrix and nstages */
1991     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1992       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1993       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1994       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
1995       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
1996       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1997         wantallmatrix = PETSC_TRUE;
1998 
1999         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2000       }
2001     }
2002 
2003     /* Determine the number of stages through which submatrices are done
2004        Each stage will extract nmax submatrices.
2005        nmax is determined by the matrix column dimension.
2006        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2007     */
2008     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2009 
2010     in[0] = -1*(PetscInt)wantallmatrix;
2011     in[1] = nstages;
2012     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2013     wantallmatrix = (PetscBool)(-out[0]);
2014     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2015 
2016   } else { /* MAT_REUSE_MATRIX */
2017     if (ismax) {
2018       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2019       smat = subc->submatis1;
2020     } else { /* (*submat)[0] is a dummy matrix */
2021       smat = (Mat_SubSppt*)(*submat)[0]->data;
2022     }
2023     if (!smat) {
2024       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2025       wantallmatrix = PETSC_TRUE;
2026     } else if (smat->singleis) {
2027       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2028       PetscFunctionReturn(0);
2029     } else {
2030       nstages = smat->nstages;
2031     }
2032   }
2033 
2034   if (wantallmatrix) {
2035     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2036     PetscFunctionReturn(0);
2037   }
2038 
2039   /* Allocate memory to hold all the submatrices and dummy submatrices */
2040   if (scall == MAT_INITIAL_MATRIX) {
2041     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2042   }
2043 
2044   for (i=0,pos=0; i<nstages; i++) {
2045     if (pos+nmax <= ismax) max_no = nmax;
2046     else if (pos >= ismax) max_no = 0;
2047     else                   max_no = ismax-pos;
2048 
2049     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2050     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2051       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2052       smat->nstages = nstages;
2053     }
2054     pos += max_no;
2055   }
2056 
2057   if (ismax && scall == MAT_INITIAL_MATRIX) {
2058     /* save nstages for reuse */
2059     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2060     smat = subc->submatis1;
2061     smat->nstages = nstages;
2062   }
2063   PetscFunctionReturn(0);
2064 }
2065 
2066 /* -------------------------------------------------------------------------*/
2067 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2068 {
2069   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2070   Mat            A  = c->A;
2071   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2072   const PetscInt **icol,**irow;
2073   PetscInt       *nrow,*ncol,start;
2074   PetscErrorCode ierr;
2075   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2076   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2077   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2078   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2079   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2080 #if defined(PETSC_USE_CTABLE)
2081   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2082 #else
2083   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2084 #endif
2085   const PetscInt *irow_i;
2086   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2087   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2088   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2089   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2090   MPI_Status     *r_status3,*r_status4,*s_status4;
2091   MPI_Comm       comm;
2092   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2093   PetscMPIInt    *onodes1,*olengths1,end;
2094   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2095   Mat_SubSppt    *smat_i;
2096   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2097   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2098 
2099   PetscFunctionBegin;
2100   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2101   size = c->size;
2102   rank = c->rank;
2103 
2104   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2105   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2106 
2107   for (i=0; i<ismax; i++) {
2108     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2109     if (!issorted[i]) iscsorted = issorted[i];
2110 
2111     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2112 
2113     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2114     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2115 
2116     /* Check for special case: allcolumn */
2117     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2118     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2119     if (colflag && ncol[i] == C->cmap->N) {
2120       allcolumns[i] = PETSC_TRUE;
2121       icol[i] = NULL;
2122     } else {
2123       allcolumns[i] = PETSC_FALSE;
2124       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2125     }
2126   }
2127 
2128   if (scall == MAT_REUSE_MATRIX) {
2129     /* Assumes new rows are same length as the old rows */
2130     for (i=0; i<ismax; i++) {
2131       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2132       subc = (Mat_SeqAIJ*)submats[i]->data;
2133       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2134 
2135       /* Initial matrix as if empty */
2136       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2137 
2138       smat_i   = subc->submatis1;
2139 
2140       nrqs        = smat_i->nrqs;
2141       nrqr        = smat_i->nrqr;
2142       rbuf1       = smat_i->rbuf1;
2143       rbuf2       = smat_i->rbuf2;
2144       rbuf3       = smat_i->rbuf3;
2145       req_source2 = smat_i->req_source2;
2146 
2147       sbuf1     = smat_i->sbuf1;
2148       sbuf2     = smat_i->sbuf2;
2149       ptr       = smat_i->ptr;
2150       tmp       = smat_i->tmp;
2151       ctr       = smat_i->ctr;
2152 
2153       pa          = smat_i->pa;
2154       req_size    = smat_i->req_size;
2155       req_source1 = smat_i->req_source1;
2156 
2157       allcolumns[i] = smat_i->allcolumns;
2158       row2proc[i]   = smat_i->row2proc;
2159       rmap[i]       = smat_i->rmap;
2160       cmap[i]       = smat_i->cmap;
2161     }
2162 
2163     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2164       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2165       smat_i = (Mat_SubSppt*)submats[0]->data;
2166 
2167       nrqs        = smat_i->nrqs;
2168       nrqr        = smat_i->nrqr;
2169       rbuf1       = smat_i->rbuf1;
2170       rbuf2       = smat_i->rbuf2;
2171       rbuf3       = smat_i->rbuf3;
2172       req_source2 = smat_i->req_source2;
2173 
2174       sbuf1       = smat_i->sbuf1;
2175       sbuf2       = smat_i->sbuf2;
2176       ptr         = smat_i->ptr;
2177       tmp         = smat_i->tmp;
2178       ctr         = smat_i->ctr;
2179 
2180       pa          = smat_i->pa;
2181       req_size    = smat_i->req_size;
2182       req_source1 = smat_i->req_source1;
2183 
2184       allcolumns[0] = PETSC_FALSE;
2185     }
2186   } else { /* scall == MAT_INITIAL_MATRIX */
2187     /* Get some new tags to keep the communication clean */
2188     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2189     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2190 
2191     /* evaluate communication - mesg to who, length of mesg, and buffer space
2192      required. Based on this, buffers are allocated, and data copied into them*/
2193     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2194 
2195     for (i=0; i<ismax; i++) {
2196       jmax   = nrow[i];
2197       irow_i = irow[i];
2198 
2199       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2200       row2proc[i] = row2proc_i;
2201 
2202       if (issorted[i]) proc = 0;
2203       for (j=0; j<jmax; j++) {
2204         if (!issorted[i]) proc = 0;
2205         row = irow_i[j];
2206         while (row >= C->rmap->range[proc+1]) proc++;
2207         w4[proc]++;
2208         row2proc_i[j] = proc; /* map row index to proc */
2209       }
2210       for (j=0; j<size; j++) {
2211         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2212       }
2213     }
2214 
2215     nrqs     = 0;              /* no of outgoing messages */
2216     msz      = 0;              /* total mesg length (for all procs) */
2217     w1[rank] = 0;              /* no mesg sent to self */
2218     w3[rank] = 0;
2219     for (i=0; i<size; i++) {
2220       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2221     }
2222     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2223     for (i=0,j=0; i<size; i++) {
2224       if (w1[i]) { pa[j] = i; j++; }
2225     }
2226 
2227     /* Each message would have a header = 1 + 2*(no of IS) + data */
2228     for (i=0; i<nrqs; i++) {
2229       j      = pa[i];
2230       w1[j] += w2[j] + 2* w3[j];
2231       msz   += w1[j];
2232     }
2233     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2234 
2235     /* Determine the number of messages to expect, their lengths, from from-ids */
2236     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2237     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2238 
2239     /* Now post the Irecvs corresponding to these messages */
2240     tag0 = ((PetscObject)C)->tag;
2241     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2242 
2243     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2244     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2245 
2246     /* Allocate Memory for outgoing messages */
2247     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2248     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2249     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2250 
2251     {
2252       PetscInt *iptr = tmp;
2253       k    = 0;
2254       for (i=0; i<nrqs; i++) {
2255         j        = pa[i];
2256         iptr    += k;
2257         sbuf1[j] = iptr;
2258         k        = w1[j];
2259       }
2260     }
2261 
2262     /* Form the outgoing messages. Initialize the header space */
2263     for (i=0; i<nrqs; i++) {
2264       j           = pa[i];
2265       sbuf1[j][0] = 0;
2266       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2267       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2268     }
2269 
2270     /* Parse the isrow and copy data into outbuf */
2271     for (i=0; i<ismax; i++) {
2272       row2proc_i = row2proc[i];
2273       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2274       irow_i = irow[i];
2275       jmax   = nrow[i];
2276       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2277         proc = row2proc_i[j];
2278         if (proc != rank) { /* copy to the outgoing buf*/
2279           ctr[proc]++;
2280           *ptr[proc] = irow_i[j];
2281           ptr[proc]++;
2282         }
2283       }
2284       /* Update the headers for the current IS */
2285       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2286         if ((ctr_j = ctr[j])) {
2287           sbuf1_j        = sbuf1[j];
2288           k              = ++sbuf1_j[0];
2289           sbuf1_j[2*k]   = ctr_j;
2290           sbuf1_j[2*k-1] = i;
2291         }
2292       }
2293     }
2294 
2295     /*  Now  post the sends */
2296     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2297     for (i=0; i<nrqs; ++i) {
2298       j    = pa[i];
2299       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2300     }
2301 
2302     /* Post Receives to capture the buffer size */
2303     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2304     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2305     rbuf2[0] = tmp + msz;
2306     for (i=1; i<nrqs; ++i) {
2307       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2308     }
2309     for (i=0; i<nrqs; ++i) {
2310       j    = pa[i];
2311       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2312     }
2313 
2314     /* Send to other procs the buf size they should allocate */
2315     /* Receive messages*/
2316     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2317     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2318     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2319     {
2320       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2321       PetscInt   *sbuf2_i;
2322 
2323       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2324       for (i=0; i<nrqr; ++i) {
2325         req_size[i] = 0;
2326         rbuf1_i        = rbuf1[i];
2327         start          = 2*rbuf1_i[0] + 1;
2328         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2329         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2330         sbuf2_i        = sbuf2[i];
2331         for (j=start; j<end; j++) {
2332           id              = rbuf1_i[j] - rstart;
2333           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2334           sbuf2_i[j]      = ncols;
2335           req_size[i] += ncols;
2336         }
2337         req_source1[i] = r_status1[i].MPI_SOURCE;
2338         /* form the header */
2339         sbuf2_i[0] = req_size[i];
2340         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2341 
2342         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2343       }
2344     }
2345     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2346     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2347     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2348 
2349     /* Receive messages*/
2350     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2351     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2352 
2353     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2354     for (i=0; i<nrqs; ++i) {
2355       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2356       req_source2[i] = r_status2[i].MPI_SOURCE;
2357       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2358     }
2359     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2360     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2361 
2362     /* Wait on sends1 and sends2 */
2363     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2364     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2365 
2366     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2367     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2368     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2369     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2370     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2371     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2372 
2373     /* Now allocate sending buffers for a->j, and send them off */
2374     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2375     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2376     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2377     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2378 
2379     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2380     {
2381       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2382       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2383       PetscInt cend = C->cmap->rend;
2384       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2385 
2386       for (i=0; i<nrqr; i++) {
2387         rbuf1_i   = rbuf1[i];
2388         sbuf_aj_i = sbuf_aj[i];
2389         ct1       = 2*rbuf1_i[0] + 1;
2390         ct2       = 0;
2391         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2392           kmax = rbuf1[i][2*j];
2393           for (k=0; k<kmax; k++,ct1++) {
2394             row    = rbuf1_i[ct1] - rstart;
2395             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2396             ncols  = nzA + nzB;
2397             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2398 
2399             /* load the column indices for this row into cols */
2400             cols = sbuf_aj_i + ct2;
2401 
2402             lwrite = 0;
2403             for (l=0; l<nzB; l++) {
2404               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2405             }
2406             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2407             for (l=0; l<nzB; l++) {
2408               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2409             }
2410 
2411             ct2 += ncols;
2412           }
2413         }
2414         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2415       }
2416     }
2417     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2418 
2419     /* create col map: global col of C -> local col of submatrices */
2420     {
2421       const PetscInt *icol_i;
2422 #if defined(PETSC_USE_CTABLE)
2423       for (i=0; i<ismax; i++) {
2424         if (!allcolumns[i]) {
2425           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2426 
2427           jmax   = ncol[i];
2428           icol_i = icol[i];
2429           cmap_i = cmap[i];
2430           for (j=0; j<jmax; j++) {
2431             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2432           }
2433         } else cmap[i] = NULL;
2434       }
2435 #else
2436       for (i=0; i<ismax; i++) {
2437         if (!allcolumns[i]) {
2438           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2439           jmax   = ncol[i];
2440           icol_i = icol[i];
2441           cmap_i = cmap[i];
2442           for (j=0; j<jmax; j++) {
2443             cmap_i[icol_i[j]] = j+1;
2444           }
2445         } else cmap[i] = NULL;
2446       }
2447 #endif
2448     }
2449 
2450     /* Create lens which is required for MatCreate... */
2451     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2452     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2453 
2454     if (ismax) {
2455       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2456     }
2457     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2458 
2459     /* Update lens from local data */
2460     for (i=0; i<ismax; i++) {
2461       row2proc_i = row2proc[i];
2462       jmax = nrow[i];
2463       if (!allcolumns[i]) cmap_i = cmap[i];
2464       irow_i = irow[i];
2465       lens_i = lens[i];
2466       for (j=0; j<jmax; j++) {
2467         row = irow_i[j];
2468         proc = row2proc_i[j];
2469         if (proc == rank) {
2470           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2471           if (!allcolumns[i]) {
2472             for (k=0; k<ncols; k++) {
2473 #if defined(PETSC_USE_CTABLE)
2474               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2475 #else
2476               tcol = cmap_i[cols[k]];
2477 #endif
2478               if (tcol) lens_i[j]++;
2479             }
2480           } else { /* allcolumns */
2481             lens_i[j] = ncols;
2482           }
2483           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2484         }
2485       }
2486     }
2487 
2488     /* Create row map: global row of C -> local row of submatrices */
2489 #if defined(PETSC_USE_CTABLE)
2490     for (i=0; i<ismax; i++) {
2491       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2492       irow_i = irow[i];
2493       jmax   = nrow[i];
2494       for (j=0; j<jmax; j++) {
2495       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2496       }
2497     }
2498 #else
2499     for (i=0; i<ismax; i++) {
2500       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2501       rmap_i = rmap[i];
2502       irow_i = irow[i];
2503       jmax   = nrow[i];
2504       for (j=0; j<jmax; j++) {
2505         rmap_i[irow_i[j]] = j;
2506       }
2507     }
2508 #endif
2509 
2510     /* Update lens from offproc data */
2511     {
2512       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2513 
2514       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2515       for (tmp2=0; tmp2<nrqs; tmp2++) {
2516         sbuf1_i = sbuf1[pa[tmp2]];
2517         jmax    = sbuf1_i[0];
2518         ct1     = 2*jmax+1;
2519         ct2     = 0;
2520         rbuf2_i = rbuf2[tmp2];
2521         rbuf3_i = rbuf3[tmp2];
2522         for (j=1; j<=jmax; j++) {
2523           is_no  = sbuf1_i[2*j-1];
2524           max1   = sbuf1_i[2*j];
2525           lens_i = lens[is_no];
2526           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2527           rmap_i = rmap[is_no];
2528           for (k=0; k<max1; k++,ct1++) {
2529 #if defined(PETSC_USE_CTABLE)
2530             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2531             row--;
2532             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2533 #else
2534             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2535 #endif
2536             max2 = rbuf2_i[ct1];
2537             for (l=0; l<max2; l++,ct2++) {
2538               if (!allcolumns[is_no]) {
2539 #if defined(PETSC_USE_CTABLE)
2540                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2541 #else
2542                 tcol = cmap_i[rbuf3_i[ct2]];
2543 #endif
2544                 if (tcol) lens_i[row]++;
2545               } else { /* allcolumns */
2546                 lens_i[row]++; /* lens_i[row] += max2 ? */
2547               }
2548             }
2549           }
2550         }
2551       }
2552     }
2553     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2554     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2555     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2556     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2557 
2558     /* Create the submatrices */
2559     for (i=0; i<ismax; i++) {
2560       PetscInt    rbs,cbs;
2561 
2562       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2563       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2564 
2565       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2566       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2567 
2568       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2569       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2570       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2571 
2572       /* create struct Mat_SubSppt and attached it to submat */
2573       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2574       subc = (Mat_SeqAIJ*)submats[i]->data;
2575       subc->submatis1 = smat_i;
2576 
2577       smat_i->destroy          = submats[i]->ops->destroy;
2578       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2579       submats[i]->factortype   = C->factortype;
2580 
2581       smat_i->id          = i;
2582       smat_i->nrqs        = nrqs;
2583       smat_i->nrqr        = nrqr;
2584       smat_i->rbuf1       = rbuf1;
2585       smat_i->rbuf2       = rbuf2;
2586       smat_i->rbuf3       = rbuf3;
2587       smat_i->sbuf2       = sbuf2;
2588       smat_i->req_source2 = req_source2;
2589 
2590       smat_i->sbuf1       = sbuf1;
2591       smat_i->ptr         = ptr;
2592       smat_i->tmp         = tmp;
2593       smat_i->ctr         = ctr;
2594 
2595       smat_i->pa           = pa;
2596       smat_i->req_size     = req_size;
2597       smat_i->req_source1  = req_source1;
2598 
2599       smat_i->allcolumns  = allcolumns[i];
2600       smat_i->singleis    = PETSC_FALSE;
2601       smat_i->row2proc    = row2proc[i];
2602       smat_i->rmap        = rmap[i];
2603       smat_i->cmap        = cmap[i];
2604     }
2605 
2606     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2607       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2608       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2609       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2610 
2611       /* create struct Mat_SubSppt and attached it to submat */
2612       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2613       submats[0]->data = (void*)smat_i;
2614 
2615       smat_i->destroy          = submats[0]->ops->destroy;
2616       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2617       submats[0]->factortype   = C->factortype;
2618 
2619       smat_i->id          = 0;
2620       smat_i->nrqs        = nrqs;
2621       smat_i->nrqr        = nrqr;
2622       smat_i->rbuf1       = rbuf1;
2623       smat_i->rbuf2       = rbuf2;
2624       smat_i->rbuf3       = rbuf3;
2625       smat_i->sbuf2       = sbuf2;
2626       smat_i->req_source2 = req_source2;
2627 
2628       smat_i->sbuf1       = sbuf1;
2629       smat_i->ptr         = ptr;
2630       smat_i->tmp         = tmp;
2631       smat_i->ctr         = ctr;
2632 
2633       smat_i->pa           = pa;
2634       smat_i->req_size     = req_size;
2635       smat_i->req_source1  = req_source1;
2636 
2637       smat_i->allcolumns  = PETSC_FALSE;
2638       smat_i->singleis    = PETSC_FALSE;
2639       smat_i->row2proc    = NULL;
2640       smat_i->rmap        = NULL;
2641       smat_i->cmap        = NULL;
2642     }
2643 
2644     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2645     ierr = PetscFree(lens);CHKERRQ(ierr);
2646     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2647     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2648 
2649   } /* endof scall == MAT_INITIAL_MATRIX */
2650 
2651   /* Post recv matrix values */
2652   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2653   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2654   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2655   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2656   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2657   for (i=0; i<nrqs; ++i) {
2658     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2659     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2660   }
2661 
2662   /* Allocate sending buffers for a->a, and send them off */
2663   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2664   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2665   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2666   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2667 
2668   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2669   {
2670     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2671     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2672     PetscInt    cend   = C->cmap->rend;
2673     PetscInt    *b_j   = b->j;
2674     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2675 
2676     for (i=0; i<nrqr; i++) {
2677       rbuf1_i   = rbuf1[i];
2678       sbuf_aa_i = sbuf_aa[i];
2679       ct1       = 2*rbuf1_i[0]+1;
2680       ct2       = 0;
2681       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2682         kmax = rbuf1_i[2*j];
2683         for (k=0; k<kmax; k++,ct1++) {
2684           row    = rbuf1_i[ct1] - rstart;
2685           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2686           ncols  = nzA + nzB;
2687           cworkB = b_j + b_i[row];
2688           vworkA = a_a + a_i[row];
2689           vworkB = b_a + b_i[row];
2690 
2691           /* load the column values for this row into vals*/
2692           vals = sbuf_aa_i+ct2;
2693 
2694           lwrite = 0;
2695           for (l=0; l<nzB; l++) {
2696             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2697           }
2698           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2699           for (l=0; l<nzB; l++) {
2700             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2701           }
2702 
2703           ct2 += ncols;
2704         }
2705       }
2706       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2707     }
2708   }
2709 
2710   /* Assemble the matrices */
2711   /* First assemble the local rows */
2712   for (i=0; i<ismax; i++) {
2713     row2proc_i = row2proc[i];
2714     subc      = (Mat_SeqAIJ*)submats[i]->data;
2715     imat_ilen = subc->ilen;
2716     imat_j    = subc->j;
2717     imat_i    = subc->i;
2718     imat_a    = subc->a;
2719 
2720     if (!allcolumns[i]) cmap_i = cmap[i];
2721     rmap_i = rmap[i];
2722     irow_i = irow[i];
2723     jmax   = nrow[i];
2724     for (j=0; j<jmax; j++) {
2725       row  = irow_i[j];
2726       proc = row2proc_i[j];
2727       if (proc == rank) {
2728         old_row = row;
2729 #if defined(PETSC_USE_CTABLE)
2730         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2731         row--;
2732 #else
2733         row = rmap_i[row];
2734 #endif
2735         ilen_row = imat_ilen[row];
2736         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2737         mat_i    = imat_i[row];
2738         mat_a    = imat_a + mat_i;
2739         mat_j    = imat_j + mat_i;
2740         if (!allcolumns[i]) {
2741           for (k=0; k<ncols; k++) {
2742 #if defined(PETSC_USE_CTABLE)
2743             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2744 #else
2745             tcol = cmap_i[cols[k]];
2746 #endif
2747             if (tcol) {
2748               *mat_j++ = tcol - 1;
2749               *mat_a++ = vals[k];
2750               ilen_row++;
2751             }
2752           }
2753         } else { /* allcolumns */
2754           for (k=0; k<ncols; k++) {
2755             *mat_j++ = cols[k];  /* global col index! */
2756             *mat_a++ = vals[k];
2757             ilen_row++;
2758           }
2759         }
2760         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2761 
2762         imat_ilen[row] = ilen_row;
2763       }
2764     }
2765   }
2766 
2767   /* Now assemble the off proc rows */
2768   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2769   for (tmp2=0; tmp2<nrqs; tmp2++) {
2770     sbuf1_i = sbuf1[pa[tmp2]];
2771     jmax    = sbuf1_i[0];
2772     ct1     = 2*jmax + 1;
2773     ct2     = 0;
2774     rbuf2_i = rbuf2[tmp2];
2775     rbuf3_i = rbuf3[tmp2];
2776     rbuf4_i = rbuf4[tmp2];
2777     for (j=1; j<=jmax; j++) {
2778       is_no     = sbuf1_i[2*j-1];
2779       rmap_i    = rmap[is_no];
2780       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2781       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2782       imat_ilen = subc->ilen;
2783       imat_j    = subc->j;
2784       imat_i    = subc->i;
2785       imat_a    = subc->a;
2786       max1      = sbuf1_i[2*j];
2787       for (k=0; k<max1; k++,ct1++) {
2788         row = sbuf1_i[ct1];
2789 #if defined(PETSC_USE_CTABLE)
2790         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2791         row--;
2792 #else
2793         row = rmap_i[row];
2794 #endif
2795         ilen  = imat_ilen[row];
2796         mat_i = imat_i[row];
2797         mat_a = imat_a + mat_i;
2798         mat_j = imat_j + mat_i;
2799         max2  = rbuf2_i[ct1];
2800         if (!allcolumns[is_no]) {
2801           for (l=0; l<max2; l++,ct2++) {
2802 #if defined(PETSC_USE_CTABLE)
2803             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2804 #else
2805             tcol = cmap_i[rbuf3_i[ct2]];
2806 #endif
2807             if (tcol) {
2808               *mat_j++ = tcol - 1;
2809               *mat_a++ = rbuf4_i[ct2];
2810               ilen++;
2811             }
2812           }
2813         } else { /* allcolumns */
2814           for (l=0; l<max2; l++,ct2++) {
2815             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2816             *mat_a++ = rbuf4_i[ct2];
2817             ilen++;
2818           }
2819         }
2820         imat_ilen[row] = ilen;
2821       }
2822     }
2823   }
2824 
2825   if (!iscsorted) { /* sort column indices of the rows */
2826     for (i=0; i<ismax; i++) {
2827       subc      = (Mat_SeqAIJ*)submats[i]->data;
2828       imat_j    = subc->j;
2829       imat_i    = subc->i;
2830       imat_a    = subc->a;
2831       imat_ilen = subc->ilen;
2832 
2833       if (allcolumns[i]) continue;
2834       jmax = nrow[i];
2835       for (j=0; j<jmax; j++) {
2836         mat_i = imat_i[j];
2837         mat_a = imat_a + mat_i;
2838         mat_j = imat_j + mat_i;
2839         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2840       }
2841     }
2842   }
2843 
2844   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2845   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2846   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2847   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2848   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2849 
2850   /* Restore the indices */
2851   for (i=0; i<ismax; i++) {
2852     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2853     if (!allcolumns[i]) {
2854       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2855     }
2856   }
2857 
2858   for (i=0; i<ismax; i++) {
2859     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2860     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2861   }
2862 
2863   /* Destroy allocated memory */
2864   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2865   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2866   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2867 
2868   for (i=0; i<nrqs; ++i) {
2869     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2870   }
2871   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2872 
2873   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2874   PetscFunctionReturn(0);
2875 }
2876 
2877 /*
2878  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2879  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2880  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2881  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2882  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2883  state, and needs to be "assembled" later by compressing B's column space.
2884 
2885  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2886  Following this call, C->A & C->B have been created, even if empty.
2887  */
2888 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2889 {
2890   /* If making this function public, change the error returned in this function away from _PLIB. */
2891   PetscErrorCode ierr;
2892   Mat_MPIAIJ     *aij;
2893   Mat_SeqAIJ     *Baij;
2894   PetscBool      seqaij,Bdisassembled;
2895   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2896   PetscScalar    v;
2897   const PetscInt *rowindices,*colindices;
2898 
2899   PetscFunctionBegin;
2900   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2901   if (A) {
2902     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2903     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2904     if (rowemb) {
2905       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2906       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2907     } else {
2908       if (C->rmap->n != A->rmap->n) {
2909 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2910       }
2911     }
2912     if (dcolemb) {
2913       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2914       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2915     } else {
2916       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2917     }
2918   }
2919   if (B) {
2920     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2921     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2922     if (rowemb) {
2923       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2924       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2925     } else {
2926       if (C->rmap->n != B->rmap->n) {
2927 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2928       }
2929     }
2930     if (ocolemb) {
2931       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2932       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2933     } else {
2934       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2935     }
2936   }
2937 
2938   aij = (Mat_MPIAIJ*)C->data;
2939   if (!aij->A) {
2940     /* Mimic parts of MatMPIAIJSetPreallocation() */
2941     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2942     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2943     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2944     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2945     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2946   }
2947   if (A) {
2948     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2949   } else {
2950     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2951   }
2952   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2953     /*
2954       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2955       need to "disassemble" B -- convert it to using C's global indices.
2956       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2957 
2958       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2959       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2960 
2961       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2962       At least avoid calling MatSetValues() and the implied searches?
2963     */
2964 
2965     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2966 #if defined(PETSC_USE_CTABLE)
2967       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2968 #else
2969       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2970       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2971       if (aij->B) {
2972         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2973       }
2974 #endif
2975       ngcol = 0;
2976       if (aij->lvec) {
2977 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2978       }
2979       if (aij->garray) {
2980 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2981 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2982       }
2983       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2984       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2985     }
2986     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2987       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2988     }
2989     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2990       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2991     }
2992   }
2993   Bdisassembled = PETSC_FALSE;
2994   if (!aij->B) {
2995     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2996     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2997     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2998     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2999     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3000     Bdisassembled = PETSC_TRUE;
3001   }
3002   if (B) {
3003     Baij = (Mat_SeqAIJ*)B->data;
3004     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3005       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3006       for (i=0; i<B->rmap->n; i++) {
3007 	nz[i] = Baij->i[i+1] - Baij->i[i];
3008       }
3009       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3010       ierr = PetscFree(nz);CHKERRQ(ierr);
3011     }
3012 
3013     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3014     shift = rend-rstart;
3015     count = 0;
3016     rowindices = NULL;
3017     colindices = NULL;
3018     if (rowemb) {
3019       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3020     }
3021     if (ocolemb) {
3022       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3023     }
3024     for (i=0; i<B->rmap->n; i++) {
3025       PetscInt row;
3026       row = i;
3027       if (rowindices) row = rowindices[i];
3028       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3029 	col  = Baij->j[count];
3030 	if (colindices) col = colindices[col];
3031 	if (Bdisassembled && col>=rstart) col += shift;
3032 	v    = Baij->a[count];
3033 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3034 	++count;
3035       }
3036     }
3037     /* No assembly for aij->B is necessary. */
3038     /* FIXME: set aij->B's nonzerostate correctly. */
3039   } else {
3040     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3041   }
3042   C->preallocated  = PETSC_TRUE;
3043   C->was_assembled = PETSC_FALSE;
3044   C->assembled     = PETSC_FALSE;
3045    /*
3046       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3047       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3048    */
3049   PetscFunctionReturn(0);
3050 }
3051 
3052 /*
3053   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3054  */
3055 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3056 {
3057   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3058 
3059   PetscFunctionBegin;
3060   PetscValidPointer(A,2);
3061   PetscValidPointer(B,3);
3062   /* FIXME: make sure C is assembled */
3063   *A = aij->A;
3064   *B = aij->B;
3065   /* Note that we don't incref *A and *B, so be careful! */
3066   PetscFunctionReturn(0);
3067 }
3068 
3069 /*
3070   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3071   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3072 */
3073 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3074                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3075 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3076 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3077 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3078 {
3079   PetscErrorCode ierr;
3080   PetscMPIInt    isize,flag;
3081   PetscInt       i,ii,cismax,ispar;
3082   Mat            *A,*B;
3083   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3084 
3085   PetscFunctionBegin;
3086   if (!ismax) PetscFunctionReturn(0);
3087 
3088   for (i = 0, cismax = 0; i < ismax; ++i) {
3089     PetscMPIInt isize;
3090     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3091     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3092     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3093     if (isize > 1) ++cismax;
3094   }
3095 
3096   /*
3097      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3098      ispar counts the number of parallel ISs across C's comm.
3099   */
3100   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3101   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3102     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3103     PetscFunctionReturn(0);
3104   }
3105 
3106   /* if (ispar) */
3107   /*
3108     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3109     These are used to extract the off-diag portion of the resulting parallel matrix.
3110     The row IS for the off-diag portion is the same as for the diag portion,
3111     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3112   */
3113   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3114   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3115   for (i = 0, ii = 0; i < ismax; ++i) {
3116     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3117     if (isize > 1) {
3118       /*
3119 	 TODO: This is the part that's ***NOT SCALABLE***.
3120 	 To fix this we need to extract just the indices of C's nonzero columns
3121 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3122 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3123 	 to be done without serializing on the IS list, so, most likely, it is best
3124 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3125       */
3126       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3127       /* Now we have to
3128 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3129 	     were sorted on each rank, concatenated they might no longer be sorted;
3130 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3131 	     indices in the nondecreasing order to the original index positions.
3132 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3133       */
3134       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3135       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3136       ++ii;
3137     }
3138   }
3139   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3140   for (i = 0, ii = 0; i < ismax; ++i) {
3141     PetscInt       j,issize;
3142     const PetscInt *indices;
3143 
3144     /*
3145        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3146      */
3147     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3148     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3149     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3150     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3151     for (j = 1; j < issize; ++j) {
3152       if (indices[j] == indices[j-1]) {
3153 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3154       }
3155     }
3156     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3157 
3158 
3159     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3160     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3161     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3162     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3163     for (j = 1; j < issize; ++j) {
3164       if (indices[j-1] == indices[j]) {
3165 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3166       }
3167     }
3168     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3169     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3170     if (isize > 1) {
3171       cisrow[ii] = isrow[i];
3172       ++ii;
3173     }
3174   }
3175   /*
3176     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3177     array of sequential matrices underlying the resulting parallel matrices.
3178     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3179     contain duplicates.
3180 
3181     There are as many diag matrices as there are original index sets. There are only as many parallel
3182     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3183 
3184     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3185     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3186       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3187       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3188       with A[i] and B[ii] extracted from the corresponding MPI submat.
3189     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3190       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3191       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3192       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3193       values into A[i] and B[ii] sitting inside the corresponding submat.
3194     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3195       A[i], B[ii], AA[i] or BB[ii] matrices.
3196   */
3197   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3198   if (scall == MAT_INITIAL_MATRIX) {
3199     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3200   }
3201 
3202   /* Now obtain the sequential A and B submatrices separately. */
3203   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3204   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3205   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3206 
3207   /*
3208     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3209     matrices A & B have been extracted directly into the parallel matrices containing them, or
3210     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3211     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3212     to have the same sparsity pattern.
3213     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3214     must be constructed for C. This is done by setseqmat(s).
3215   */
3216   for (i = 0, ii = 0; i < ismax; ++i) {
3217     /*
3218        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3219        That way we can avoid sorting and computing permutations when reusing.
3220        To this end:
3221         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3222 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3223 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3224     */
3225     MatStructure pattern;
3226     pattern = DIFFERENT_NONZERO_PATTERN;
3227 
3228     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3229     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3230     if (isize > 1) {
3231       if (scall == MAT_INITIAL_MATRIX) {
3232 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3233 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3234 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3235 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3236 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3237       }
3238       /*
3239 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3240       */
3241       {
3242 	Mat AA,BB;
3243         AA = A[i];
3244         BB = B[ii];
3245 	if (AA || BB) {
3246 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3247 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3248 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3249 	}
3250 
3251         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3252       }
3253       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3254       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3255       ++ii;
3256     } else { /* if (isize == 1) */
3257       if (scall == MAT_REUSE_MATRIX) {
3258         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3259       }
3260       if (isrow_p[i] || iscol_p[i]) {
3261         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3262         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3263 	/* Otherwise A is extracted straight into (*submats)[i]. */
3264 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3265 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3266       } else (*submat)[i] = A[i];
3267     }
3268     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3269     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3270   }
3271   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3272   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3273   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3274   ierr = PetscFree(A);CHKERRQ(ierr);
3275   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3276   PetscFunctionReturn(0);
3277 }
3278 
3279 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3280 {
3281   PetscErrorCode ierr;
3282 
3283   PetscFunctionBegin;
3284   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3285   PetscFunctionReturn(0);
3286 }
3287