xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision efa12513287cff49a2b9648ae83199dcbfaad71a)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put it all together so that we
69    * can optimize communication
70    */
71   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++) {
82     length_i  = length[i];
83     indices_i = indices[i];
84     for (j=0; j<length_i; j++) {
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank) {
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages;
99    * generally speaking, we do not need to exchange
100    * data when the overlap is 1
101    */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages;
104    * this usually corresponds to overlap 1
105    */
106   if (!reducednrrows) {
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communication */
114   for (i=0; i<size; i++) {
115     if (tosizes_temp[i]) {
116       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117       tosizes_temp[i] = nto; /* a map from processor to index */
118       toranks[nto++]  = i; /* processor */
119     }
120   }
121   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   toffsets[0] = 0;
123   for (i=0; i<nto; i++) {
124     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
125     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
126   }
127   /* send information to other processors */
128   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
129   nrecvrows = 0;
130   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
131   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
132   nrecvrows = 0;
133   for (i=0; i<nfrom; i++) {
134     for (j=0; j<fromsizes[2*i]; j++) {
135       remote[nrecvrows].rank    = fromranks[i];
136       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
137     }
138   }
139   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
140   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
141   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
142   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
143   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
144   /* message pair <IS id, row>  */
145   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
146   for (i=0; i<nrrows; i++) {
147     owner = rrow_ranks[i]; /* processor */
148     j     = tosizes_temp[owner]; /* index */
149     todata[toffsets[j]++] = rrow_isids[i];
150     todata[toffsets[j]++] = remoterows[i];
151   }
152   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
153   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
154   ierr = PetscFree(toffsets);CHKERRQ(ierr);
155   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
157   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
158   /* send rows belonging to remote processors so that we can get the overlapping data back */
159   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
160   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
162   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
163   ierr = PetscFree(fromranks);CHKERRQ(ierr);
164   nrecvrows = 0;
165   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
166   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
167   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
168   nrecvrows = 0;
169   for (i=0; i<nto; i++) {
170     for (j=0; j<tosizes[2*i]; j++) {
171       remote[nrecvrows].rank    = toranks[i];
172       remote[nrecvrows++].index = tosizes[2*i+1]+j;
173     }
174   }
175   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
176   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
177   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
178   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
179   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
180   /* overlap communication and computation */
181   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
182   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
183   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
184   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
185   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
186   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
187   ierr = PetscFree(toranks);CHKERRQ(ierr);
188   ierr = PetscFree(tosizes);CHKERRQ(ierr);
189   ierr = PetscFree(todata);CHKERRQ(ierr);
190   PetscFunctionReturn(0);
191 }
192 
193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
194 {
195   PetscInt         *isz,isz_i,i,j,is_id, data_size;
196   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
197   const PetscInt   *indices_i_temp;
198   MPI_Comm         *iscomms;
199   PetscErrorCode    ierr;
200 
201   PetscFunctionBegin;
202   max_lsize = 0;
203   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
204   for (i=0; i<nidx; i++){
205     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
206     max_lsize = lsize>max_lsize ? lsize:max_lsize;
207     isz[i]    = lsize;
208   }
209   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
212     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
214     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
215     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
216   }
217   /* retrieve information to get row id and its overlap */
218   for (i=0; i<nrecvs;){
219     is_id     = recvdata[i++];
220     data_size = recvdata[i++];
221     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
222     isz_i     = isz[is_id];
223     for (j=0; j< data_size; j++){
224       col = recvdata[i++];
225       indices_i[isz_i++] = col;
226     }
227     isz[is_id] = isz_i;
228   }
229   /* remove duplicate entries */
230   for (i=0; i<nidx; i++){
231     indices_i = indices_temp+(max_lsize+nrecvs)*i;
232     isz_i     = isz[i];
233     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
234     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
235     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
236   }
237   ierr = PetscFree(isz);CHKERRQ(ierr);
238   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
243 {
244   PetscLayout       rmap,cmap;
245   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
246   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
247   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
248   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
249   Mat               amat,bmat;
250   PetscMPIInt       rank;
251   PetscBool         done;
252   MPI_Comm          comm;
253   PetscErrorCode    ierr;
254 
255   PetscFunctionBegin;
256   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
257   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
258   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
259   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
260   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
263   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
264   /* total number of nonzero values is used to estimate the memory usage in the next step */
265   tnz  = ai[an]+bi[bn];
266   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
267   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
268   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
269   /* to find the longest message */
270   max_fszs = 0;
271   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
272   /* better way to estimate number of nonzero in the mat??? */
273   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
274   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
275   rows_pos  = 0;
276   totalrows = 0;
277   for (i=0; i<nfrom; i++){
278     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
279     /* group data together */
280     for (j=0; j<fromsizes[2*i]; j+=2){
281       is_id                       = fromrows[rows_pos++];/* no of is */
282       rows_i                      = rows_data[is_id];
283       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
284     }
285     /* estimate space to avoid multiple allocations  */
286     for (j=0; j<nidx; j++){
287       indvc_ij = 0;
288       rows_i   = rows_data[j];
289       for (l=0; l<rows_pos_i[j]; l++){
290         row    = rows_i[l]-rstart;
291         start  = ai[row];
292         end    = ai[row+1];
293         for (k=start; k<end; k++){ /* Amat */
294           col = aj[k] + cstart;
295           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
296         }
297         start = bi[row];
298         end   = bi[row+1];
299         for (k=start; k<end; k++) { /* Bmat */
300           col = gcols[bj[k]];
301           indices_tmp[indvc_ij++] = col;
302         }
303       }
304       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
305       indv_counts[i*nidx+j] = indvc_ij;
306       totalrows            += indvc_ij;
307     }
308   }
309   /* message triple <IS id, number of rows, rows> */
310   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
311   totalrows = 0;
312   rows_pos  = 0;
313   /* the grouping code above is reused here */
314   for (i=0;i<nfrom;i++){
315     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
316     for (j=0; j<fromsizes[2*i]; j+=2){
317       is_id                       = fromrows[rows_pos++];
318       rows_i                      = rows_data[is_id];
319       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
320     }
321     /* add data  */
322     for (j=0; j<nidx; j++){
323       if (!indv_counts[i*nidx+j]) continue;
324       indvc_ij = 0;
325       sbdata[totalrows++] = j;
326       sbdata[totalrows++] = indv_counts[i*nidx+j];
327       sbsizes[2*i]       += 2;
328       rows_i              = rows_data[j];
329       for (l=0; l<rows_pos_i[j]; l++){
330         row   = rows_i[l]-rstart;
331         start = ai[row];
332         end   = ai[row+1];
333         for (k=start; k<end; k++){ /* Amat */
334           col = aj[k] + cstart;
335           indices_tmp[indvc_ij++] = col;
336         }
337         start = bi[row];
338         end   = bi[row+1];
339         for (k=start; k<end; k++) { /* Bmat */
340           col = gcols[bj[k]];
341           indices_tmp[indvc_ij++] = col;
342         }
343       }
344       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
345       sbsizes[2*i]  += indvc_ij;
346       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
347       totalrows += indvc_ij;
348     }
349   }
350   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
351   offsets[0] = 0;
352   for (i=0; i<nfrom; i++){
353     offsets[i+1]   = offsets[i] + sbsizes[2*i];
354     sbsizes[2*i+1] = offsets[i];
355   }
356   ierr = PetscFree(offsets);CHKERRQ(ierr);
357   if (sbrowsizes) *sbrowsizes = sbsizes;
358   if (sbrows) *sbrows = sbdata;
359   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
360   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
361   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
366 {
367   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
368   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
369   PetscInt          lsize,lsize_tmp;
370   PetscMPIInt       rank,owner;
371   Mat               amat,bmat;
372   PetscBool         done;
373   PetscLayout       cmap,rmap;
374   MPI_Comm          comm;
375   PetscErrorCode    ierr;
376 
377   PetscFunctionBegin;
378   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
379   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
380   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
381   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
384   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
385   /* is this a safe way to compute the number of nonzero values? */
386   tnz  = ai[an]+bi[bn];
387   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
388   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
389   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
390   /* this is a better way to estimate memory than the old implementation,
391    * which used the global size of the matrix
392    */
393   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
394   for (i=0; i<nidx; i++) {
395     MPI_Comm iscomm;
396 
397     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
398     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
399     lsize_tmp = 0;
400     for (j=0; j<lsize; j++) {
401       owner = -1;
402       row   = indices[j];
403       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
404       if (owner != rank) continue;
405       /* local number */
406       row  -= rstart;
407       start = ai[row];
408       end   = ai[row+1];
409       for (k=start; k<end; k++) { /* Amat */
410         col = aj[k] + cstart;
411         indices_temp[lsize_tmp++] = col;
412       }
413       start = bi[row];
414       end   = bi[row+1];
415       for (k=start; k<end; k++) { /* Bmat */
416         col = gcols[bj[k]];
417         indices_temp[lsize_tmp++] = col;
418       }
419     }
420    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
421    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
422    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
423    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
424    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
425    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
426   }
427   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
428   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
429   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 
434 /*
435   Sample message format:
436   If processor A wants processor B to process some elements corresponding
437   to index sets is[1],is[5]
438   mesg [0] = 2   (no of index sets in the mesg)
439   -----------
440   mesg [1] = 1 => is[1]
441   mesg [2] = sizeof(is[1]);
442   -----------
443   mesg [3] = 5  => is[5]
444   mesg [4] = sizeof(is[5]);
445   -----------
446   mesg [5]
447   mesg [n]  data for is[1]
448   -----------
449   mesg[n+1]
450   mesg [m]  data for is[5]
451   -----------
452 
453   Notes:
454   nrqs - no of requests sent (or to be sent out)
455   nrqr - no of requests received (which have to be or which have been processed)
456 */
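/*
  Illustrative sketch (not part of PETSc): a concrete instance of the message
  layout documented above, together with a helper that walks it.  The numbers
  are made up and the preprocessor guard is hypothetical, so the sketch is
  never compiled; it only documents the layout.
*/
#if defined(PETSC_MPIOV_MESSAGE_FORMAT_EXAMPLE) /* hypothetical guard, never defined */
static PetscErrorCode ExampleParseOverlapRequest(const PetscInt *mesg)
{
  /* e.g. mesg[] = {2, 1,2, 5,3, 3,9, 14,15,27}: two index sets,
     is[1] with rows {3,9} and is[5] with rows {14,15,27} */
  PetscInt nis = mesg[0],i,j,off = 1 + 2*nis; /* the data block starts right after the header */

  PetscFunctionBegin;
  for (i=1; i<=nis; i++) {
    PetscInt is_no = mesg[2*i-1]; /* which index set this block belongs to */
    PetscInt len   = mesg[2*i];   /* number of rows requested for it */
    for (j=0; j<len; j++) {
      PetscInt row = mesg[off++]; /* a global row number requested for is[is_no] */
      (void)row; (void)is_no;
    }
  }
  PetscFunctionReturn(0);
}
#endif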
457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458 {
459   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
460   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461   const PetscInt **idx,*idx_i;
462   PetscInt       *n,**data,len;
463 #if defined(PETSC_USE_CTABLE)
464   PetscTable     *table_data,table_data_i;
465   PetscInt       *tdata,tcount,tcount_max;
466 #else
467   PetscInt       *data_i,*d_p;
468 #endif
469   PetscErrorCode ierr;
470   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
471   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473   PetscBT        *table;
474   MPI_Comm       comm;
475   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476   MPI_Status     *s_status,*recv_status;
477   MPI_Comm       *iscomms;
478   char           *t_p;
479 
480   PetscFunctionBegin;
481   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
482   size = c->size;
483   rank = c->rank;
484   M    = C->rmap->N;
485 
486   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
487   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
488 
489   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
490 
491   for (i=0; i<imax; i++) {
492     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
493     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
494   }
495 
496   /* evaluate communication - message to whom, length of message, and buffer space
497      required. Based on this, buffers are allocated, and data copied into them */
498   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
499   for (i=0; i<imax; i++) {
500     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
501     idx_i = idx[i];
502     len   = n[i];
503     for (j=0; j<len; j++) {
504       row = idx_i[j];
505       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
507       w4[proc]++;
508     }
509     for (j=0; j<size; j++) {
510       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511     }
512   }
513 
514   nrqs     = 0;              /* no of outgoing messages */
515   msz      = 0;              /* total mesg length (for all procs) */
516   w1[rank] = 0;              /* no mesg sent to itself */
517   w3[rank] = 0;
518   for (i=0; i<size; i++) {
519     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520   }
521   /* pa - is list of processors to communicate with */
522   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
523   for (i=0,j=0; i<size; i++) {
524     if (w1[i]) {pa[j] = i; j++;}
525   }
526 
527   /* Each message would have a header = 1 + 2*(no of IS) + data */
528   for (i=0; i<nrqs; i++) {
529     j      = pa[i];
530     w1[j] += w2[j] + 2*w3[j];
531     msz   += w1[j];
532   }
533 
534   /* Determine the number of messages to expect, their lengths, and their from-ids */
535   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
536   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
537 
538   /* Now post the Irecvs corresponding to these messages */
539   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
540 
541   /* Allocate Memory for outgoing messages */
542   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
543   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
544   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
545 
546   {
547     PetscInt *iptr = tmp,ict  = 0;
548     for (i=0; i<nrqs; i++) {
549       j         = pa[i];
550       iptr     +=  ict;
551       outdat[j] = iptr;
552       ict       = w1[j];
553     }
554   }
555 
556   /* Form the outgoing messages */
557   /* plug in the headers */
558   for (i=0; i<nrqs; i++) {
559     j            = pa[i];
560     outdat[j][0] = 0;
561     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
562     ptr[j]       = outdat[j] + 2*w3[j] + 1;
563   }
564 
565   /* Memory for doing local proc's work */
566   {
567     PetscInt M_BPB_imax = 0;
568 #if defined(PETSC_USE_CTABLE)
569     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
570     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
571     for (i=0; i<imax; i++) {
572       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
573     }
574     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
575     for (i=0; i<imax; i++) {
576       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577     }
578 #else
579     PetscInt Mimax = 0;
580     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
581     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
582     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
583     for (i=0; i<imax; i++) {
584       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585       data[i]  = d_p + M*i;
586     }
587 #endif
588   }
589 
590   /* Parse the IS and update local tables and the outgoing buf with the data */
591   {
592     PetscInt n_i,isz_i,*outdat_j,ctr_j;
593     PetscBT  table_i;
594 
595     for (i=0; i<imax; i++) {
596       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
597       n_i     = n[i];
598       table_i = table[i];
599       idx_i   = idx[i];
600 #if defined(PETSC_USE_CTABLE)
601       table_data_i = table_data[i];
602 #else
603       data_i  = data[i];
604 #endif
605       isz_i   = isz[i];
606       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
607         row  = idx_i[j];
608         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
609         if (proc != rank) { /* copy to the outgoing buffer */
610           ctr[proc]++;
611           *ptr[proc] = row;
612           ptr[proc]++;
613         } else if (!PetscBTLookupSet(table_i,row)) {
614 #if defined(PETSC_USE_CTABLE)
615           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
616 #else
617           data_i[isz_i] = row; /* Update the local table */
618 #endif
619           isz_i++;
620         }
621       }
622       /* Update the headers for the current IS */
623       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624         if ((ctr_j = ctr[j])) {
625           outdat_j        = outdat[j];
626           k               = ++outdat_j[0];
627           outdat_j[2*k]   = ctr_j;
628           outdat_j[2*k-1] = i;
629         }
630       }
631       isz[i] = isz_i;
632     }
633   }
634 
635   /*  Now  post the sends */
636   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
637   for (i=0; i<nrqs; ++i) {
638     j    = pa[i];
639     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
640   }
641 
642   /* No longer need the original indices */
643   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
644   for (i=0; i<imax; ++i) {
645     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
646     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
647   }
648   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
649 
650   for (i=0; i<imax; ++i) {
651     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
652   }
653 
654   /* Do Local work */
655 #if defined(PETSC_USE_CTABLE)
656   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
657 #else
658   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
659 #endif
660 
661   /* Receive messages */
662   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
663   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);}
664 
665   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
666   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);}
667 
668   /* Phase 1 sends are complete - deallocate buffers */
669   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
670   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
671 
672   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
673   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
674   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
675   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
676   ierr = PetscFree(rbuf);CHKERRQ(ierr);
677 
678 
679   /* Send the data back */
680   /* Do a global reduction to know the buffer space req for incoming messages */
681   {
682     PetscMPIInt *rw1;
683 
684     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
685 
686     for (i=0; i<nrqr; ++i) {
687       proc = recv_status[i].MPI_SOURCE;
688 
689       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690       rw1[proc] = isz1[i];
691     }
692     ierr = PetscFree(onodes1);CHKERRQ(ierr);
693     ierr = PetscFree(olengths1);CHKERRQ(ierr);
694 
695     /* Determine the number of messages to expect, their lengths, and their from-ids */
696     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
697     ierr = PetscFree(rw1);CHKERRQ(ierr);
698   }
699   /* Now post the Irecvs corresponding to these messages */
700   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
701 
702   /* Now  post the sends */
703   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
704   for (i=0; i<nrqr; ++i) {
705     j    = recv_status[i].MPI_SOURCE;
706     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
707   }
708 
709   /* receive work done on other processors */
710   {
711     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712     PetscMPIInt idex;
713     PetscBT     table_i;
714     MPI_Status  *status2;
715 
716     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
717     for (i=0; i<nrqs; ++i) {
718       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr);
719       /* Process the message */
720       rbuf2_i = rbuf2[idex];
721       ct1     = 2*rbuf2_i[0]+1;
722       jmax    = rbuf2[idex][0];
723       for (j=1; j<=jmax; j++) {
724         max     = rbuf2_i[2*j];
725         is_no   = rbuf2_i[2*j-1];
726         isz_i   = isz[is_no];
727         table_i = table[is_no];
728 #if defined(PETSC_USE_CTABLE)
729         table_data_i = table_data[is_no];
730 #else
731         data_i  = data[is_no];
732 #endif
733         for (k=0; k<max; k++,ct1++) {
734           row = rbuf2_i[ct1];
735           if (!PetscBTLookupSet(table_i,row)) {
736 #if defined(PETSC_USE_CTABLE)
737             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
738 #else
739             data_i[isz_i] = row;
740 #endif
741             isz_i++;
742           }
743         }
744         isz[is_no] = isz_i;
745       }
746     }
747 
748     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);}
749     ierr = PetscFree(status2);CHKERRQ(ierr);
750   }
751 
752 #if defined(PETSC_USE_CTABLE)
753   tcount_max = 0;
754   for (i=0; i<imax; ++i) {
755     table_data_i = table_data[i];
756     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
757     if (tcount_max < tcount) tcount_max = tcount;
758   }
759   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
760 #endif
761 
762   for (i=0; i<imax; ++i) {
763 #if defined(PETSC_USE_CTABLE)
764     PetscTablePosition tpos;
765     table_data_i = table_data[i];
766 
767     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
768     while (tpos) {
769       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
770       tdata[--j] = --k;
771     }
772     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
773 #else
774     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
775 #endif
776     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
777   }
778 
779   ierr = PetscFree(iscomms);CHKERRQ(ierr);
780   ierr = PetscFree(onodes2);CHKERRQ(ierr);
781   ierr = PetscFree(olengths2);CHKERRQ(ierr);
782 
783   ierr = PetscFree(pa);CHKERRQ(ierr);
784   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
785   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
786   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
787   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
790   ierr = PetscFree(s_status);CHKERRQ(ierr);
791   ierr = PetscFree(recv_status);CHKERRQ(ierr);
792   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
793   ierr = PetscFree(xdata);CHKERRQ(ierr);
794   ierr = PetscFree(isz1);CHKERRQ(ierr);
795 #if defined(PETSC_USE_CTABLE)
796   for (i=0; i<imax; i++) {
797     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
798   }
799   ierr = PetscFree(table_data);CHKERRQ(ierr);
800   ierr = PetscFree(tdata);CHKERRQ(ierr);
801   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
802 #else
803   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
804 #endif
805   PetscFunctionReturn(0);
806 }
807 
808 /*
809    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
810        the work on the local processor.
811 
812      Inputs:
813       C      - MATMPIAIJ matrix;
814       imax   - total no of index sets processed at a time;
815       table  - an array of bit tables, one of size M bits per index set.
816 
817      Output:
818       isz    - array containing the count of the solution elements corresponding
819                to each index set;
820       data or table_data  - pointer to the solutions
821 */
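/*
  Minimal sketch (illustration only, guarded by a hypothetical macro so it is
  never compiled): how the output of the routine below would be read in the
  non-CTABLE case, assuming the PetscCalloc5() layout set up by
  MatIncreaseOverlap_MPIAIJ_Once().  After the call, data[i][0..isz[i]-1] holds
  the global indices of the enlarged index set i and table[i] has the same
  entries flagged in its bit table.
*/
#if defined(PETSC_MPIOV_LOCAL_EXAMPLE) /* hypothetical guard, never defined */
static void ExampleReadLocalOverlap(PetscInt imax,PetscInt **data,PetscInt *isz,PetscBT *table)
{
  PetscInt i,j;
  for (i=0; i<imax; i++) {
    for (j=0; j<isz[i]; j++) {
      PetscInt idx = data[i][j];         /* one entry of the enlarged index set */
      (void)PetscBTLookup(table[i],idx); /* the same entry is set in the bit table */
    }
  }
}
#endif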
822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823 {
824   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825   Mat        A  = c->A,B = c->B;
826   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
828   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
829   PetscBT    table_i;
830 #if defined(PETSC_USE_CTABLE)
831   PetscTable         table_data_i;
832   PetscErrorCode     ierr;
833   PetscTablePosition tpos;
834   PetscInt           tcount,*tdata;
835 #else
836   PetscInt           *data_i;
837 #endif
838 
839   PetscFunctionBegin;
840   rstart = C->rmap->rstart;
841   cstart = C->cmap->rstart;
842   ai     = a->i;
843   aj     = a->j;
844   bi     = b->i;
845   bj     = b->j;
846   garray = c->garray;
847 
848   for (i=0; i<imax; i++) {
849 #if defined(PETSC_USE_CTABLE)
850     /* copy existing entries of table_data_i into tdata[] */
851     table_data_i = table_data[i];
852     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
853     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"tcount %D != isz[%D] %D",tcount,i,isz[i]);
854 
855     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
856     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
857     while (tpos) {
858       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
859       tdata[--j] = --row;
860       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"j %D >= tcount %D",j,tcount);
861     }
862 #else
863     data_i  = data[i];
864 #endif
865     table_i = table[i];
866     isz_i   = isz[i];
867     max     = isz[i];
868 
869     for (j=0; j<max; j++) {
870 #if defined(PETSC_USE_CTABLE)
871       row   = tdata[j] - rstart;
872 #else
873       row   = data_i[j] - rstart;
874 #endif
875       start = ai[row];
876       end   = ai[row+1];
877       for (k=start; k<end; k++) { /* Amat */
878         val = aj[k] + cstart;
879         if (!PetscBTLookupSet(table_i,val)) {
880 #if defined(PETSC_USE_CTABLE)
881           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
882 #else
883           data_i[isz_i] = val;
884 #endif
885           isz_i++;
886         }
887       }
888       start = bi[row];
889       end   = bi[row+1];
890       for (k=start; k<end; k++) { /* Bmat */
891         val = garray[bj[k]];
892         if (!PetscBTLookupSet(table_i,val)) {
893 #if defined(PETSC_USE_CTABLE)
894           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
895 #else
896           data_i[isz_i] = val;
897 #endif
898           isz_i++;
899         }
900       }
901     }
902     isz[i] = isz_i;
903 
904 #if defined(PETSC_USE_CTABLE)
905     ierr = PetscFree(tdata);CHKERRQ(ierr);
906 #endif
907   }
908   PetscFunctionReturn(0);
909 }
910 
911 /*
912       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913          and return the output
914 
915          Input:
916            C    - the matrix
917            nrqr - no of messages being processed.
918            rbuf - an array of pointers to the received requests
919 
920          Output:
921            xdata - array of messages to be sent back
922            isz1  - size of each message
923 
924   For better efficiency perhaps we should malloc each xdata[i] separately;
925 then if a realloc is required we need only copy the data for that one row
926 rather than all previous rows, as it is now where a single large chunk of
927 memory is used.
928 
929 */
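/*
  Sketch of the per-message allocation suggested in the note above (illustration
  only, guarded by a hypothetical macro so it is never compiled): if each
  xdata[i] owned its own buffer, growing one reply would never copy the replies
  already assembled.  The 1.5 growth factor mirrors the reallocation used in
  the routine below.
*/
#if defined(PETSC_MPIOV_RECEIVE_EXAMPLE) /* hypothetical guard, never defined */
static PetscErrorCode ExampleGrowReply(PetscInt **xdata_i,PetscInt *capacity,PetscInt needed)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (needed > *capacity) {
    PetscInt new_capacity = (PetscInt)(1.5*needed)+1;
    PetscInt *tmp;

    ierr      = PetscMalloc1(new_capacity,&tmp);CHKERRQ(ierr);
    ierr      = PetscArraycpy(tmp,*xdata_i,*capacity);CHKERRQ(ierr);
    ierr      = PetscFree(*xdata_i);CHKERRQ(ierr);
    *xdata_i  = tmp;
    *capacity = new_capacity;
  }
  PetscFunctionReturn(0);
}
#endif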
930 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
931 {
932   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
933   Mat            A  = c->A,B = c->B;
934   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
935   PetscErrorCode ierr;
936   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
937   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
938   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
939   PetscInt       *rbuf_i,kmax,rbuf_0;
940   PetscBT        xtable;
941 
942   PetscFunctionBegin;
943   m      = C->rmap->N;
944   rstart = C->rmap->rstart;
945   cstart = C->cmap->rstart;
946   ai     = a->i;
947   aj     = a->j;
948   bi     = b->i;
949   bj     = b->j;
950   garray = c->garray;
951 
952 
953   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
954     rbuf_i =  rbuf[i];
955     rbuf_0 =  rbuf_i[0];
956     ct    += rbuf_0;
957     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
958   }
959 
960   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
961   else max1 = 1;
962   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
963   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
964   ++no_malloc;
965   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
966   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
967 
968   ct3 = 0;
969   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
970     rbuf_i =  rbuf[i];
971     rbuf_0 =  rbuf_i[0];
972     ct1    =  2*rbuf_0+1;
973     ct2    =  ct1;
974     ct3   += ct1;
975     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
976       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
977       oct2 = ct2;
978       kmax = rbuf_i[2*j];
979       for (k=0; k<kmax; k++,ct1++) {
980         row = rbuf_i[ct1];
981         if (!PetscBTLookupSet(xtable,row)) {
982           if (!(ct3 < mem_estimate)) {
983             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
984             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
985             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
986             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
987             xdata[0]     = tmp;
988             mem_estimate = new_estimate; ++no_malloc;
989             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
990           }
991           xdata[i][ct2++] = row;
992           ct3++;
993         }
994       }
995       for (k=oct2,max2=ct2; k<max2; k++) {
996         row   = xdata[i][k] - rstart;
997         start = ai[row];
998         end   = ai[row+1];
999         for (l=start; l<end; l++) {
1000           val = aj[l] + cstart;
1001           if (!PetscBTLookupSet(xtable,val)) {
1002             if (!(ct3 < mem_estimate)) {
1003               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1004               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1005               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1006               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1007               xdata[0]     = tmp;
1008               mem_estimate = new_estimate; ++no_malloc;
1009               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1010             }
1011             xdata[i][ct2++] = val;
1012             ct3++;
1013           }
1014         }
1015         start = bi[row];
1016         end   = bi[row+1];
1017         for (l=start; l<end; l++) {
1018           val = garray[bj[l]];
1019           if (!PetscBTLookupSet(xtable,val)) {
1020             if (!(ct3 < mem_estimate)) {
1021               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1022               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1023               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1024               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1025               xdata[0]     = tmp;
1026               mem_estimate = new_estimate; ++no_malloc;
1027               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1028             }
1029             xdata[i][ct2++] = val;
1030             ct3++;
1031           }
1032         }
1033       }
1034       /* Update the header*/
1035       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1036       xdata[i][2*j-1] = rbuf_i[2*j-1];
1037     }
1038     xdata[i][0] = rbuf_0;
1039     xdata[i+1]  = xdata[i] + ct2;
1040     isz1[i]     = ct2; /* size of each message */
1041   }
1042   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1043   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1044   PetscFunctionReturn(0);
1045 }
1046 /* -------------------------------------------------------------------------*/
1047 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048 /*
1049     Every processor gets the entire matrix
1050 */
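/*
  Minimal usage sketch (illustration only, guarded by a hypothetical macro so it
  is never compiled): gather a parallel AIJ matrix onto every rank.  The cleanup
  shown simply mirrors the PetscCalloc1(2,...) allocation performed inside the
  routine below; the real callers in PETSc manage this array themselves.
*/
#if defined(PETSC_MPIOV_ALLGATHER_EXAMPLE) /* hypothetical guard, never defined */
static PetscErrorCode ExampleGatherWholeMatrix(Mat A)
{
  Mat            *Aseq;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Aseq);CHKERRQ(ierr);
  /* Aseq[0] is a PETSC_COMM_SELF matrix holding every row and column of A */
  ierr = MatDestroy(&Aseq[0]);CHKERRQ(ierr);
  ierr = PetscFree(Aseq);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif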
1051 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1052 {
1053   Mat            B;
1054   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1055   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1056   PetscErrorCode ierr;
1057   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1058   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1059   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1060 
1061   PetscFunctionBegin;
1062   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1063   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1064   if (scall == MAT_INITIAL_MATRIX) {
1065     /* ----------------------------------------------------------------
1066          Tell every processor the number of nonzeros per row
1067     */
1068     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1069     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1070       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1071     }
1072     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1073     for (i=0; i<size; i++) {
1074       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1075       displs[i]     = A->rmap->range[i];
1076     }
1077 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1078     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1079 #else
1080     sendcount = A->rmap->rend - A->rmap->rstart;
1081     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1082 #endif
1083     /* ---------------------------------------------------------------
1084          Create the sequential matrix of the same type as the local block diagonal
1085     */
1086     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1087     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1088     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1089     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1090     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1091     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1092     **Bin = B;
1093     b     = (Mat_SeqAIJ*)B->data;
1094 
1095     /*--------------------------------------------------------------------
1096        Copy my part of matrix column indices over
1097     */
1098     sendcount  = ad->nz + bd->nz;
1099     jsendbuf   = b->j + b->i[rstarts[rank]];
1100     a_jsendbuf = ad->j;
1101     b_jsendbuf = bd->j;
1102     n          = A->rmap->rend - A->rmap->rstart;
1103     cnt        = 0;
1104     for (i=0; i<n; i++) {
1105       /* put in lower diagonal portion */
1106       m = bd->i[i+1] - bd->i[i];
1107       while (m > 0) {
1108         /* is it above diagonal (in bd (compressed) numbering) */
1109         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1110         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1111         m--;
1112       }
1113 
1114       /* put in diagonal portion */
1115       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1116         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1117       }
1118 
1119       /* put in upper diagonal portion */
1120       while (m-- > 0) {
1121         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1122       }
1123     }
1124     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1125 
1126     /*--------------------------------------------------------------------
1127        Gather all column indices to all processors
1128     */
1129     for (i=0; i<size; i++) {
1130       recvcounts[i] = 0;
1131       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1132         recvcounts[i] += lens[j];
1133       }
1134     }
1135     displs[0] = 0;
1136     for (i=1; i<size; i++) {
1137       displs[i] = displs[i-1] + recvcounts[i-1];
1138     }
1139 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1140     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1141 #else
1142     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1143 #endif
1144     /*--------------------------------------------------------------------
1145         Assemble the matrix into usable form (note numerical values not yet set)
1146     */
1147     /* set the b->ilen (length of each row) values */
1148     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1149     /* set the b->i indices */
1150     b->i[0] = 0;
1151     for (i=1; i<=A->rmap->N; i++) {
1152       b->i[i] = b->i[i-1] + lens[i-1];
1153     }
1154     ierr = PetscFree(lens);CHKERRQ(ierr);
1155     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1156     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1157 
1158   } else {
1159     B = **Bin;
1160     b = (Mat_SeqAIJ*)B->data;
1161   }
1162 
1163   /*--------------------------------------------------------------------
1164        Copy my part of matrix numerical values into the values location
1165   */
1166   if (flag == MAT_GET_VALUES) {
1167     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1168     MatScalar         *sendbuf,*recvbuf;
1169 
1170     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1171     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1172     sendcount = ad->nz + bd->nz;
1173     sendbuf   = b->a + b->i[rstarts[rank]];
1174     a_sendbuf = ada;
1175     b_sendbuf = bda;
1176     b_sendj   = bd->j;
1177     n         = A->rmap->rend - A->rmap->rstart;
1178     cnt       = 0;
1179     for (i=0; i<n; i++) {
1180       /* put in lower diagonal portion */
1181       m = bd->i[i+1] - bd->i[i];
1182       while (m > 0) {
1183         /* is it above diagonal (in bd (compressed) numbering) */
1184         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185         sendbuf[cnt++] = *b_sendbuf++;
1186         m--;
1187         b_sendj++;
1188       }
1189 
1190       /* put in diagonal portion */
1191       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192         sendbuf[cnt++] = *a_sendbuf++;
1193       }
1194 
1195       /* put in upper diagonal portion */
1196       while (m-- > 0) {
1197         sendbuf[cnt++] = *b_sendbuf++;
1198         b_sendj++;
1199       }
1200     }
1201     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1202 
1203     /* -----------------------------------------------------------------
1204        Gather all numerical values to all processors
1205     */
1206     if (!recvcounts) {
1207       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1208     }
1209     for (i=0; i<size; i++) {
1210       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211     }
1212     displs[0] = 0;
1213     for (i=1; i<size; i++) {
1214       displs[i] = displs[i-1] + recvcounts[i-1];
1215     }
1216     recvbuf = b->a;
1217 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1219 #else
1220     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1221 #endif
1222     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1223     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1224   }  /* end of (flag == MAT_GET_VALUES) */
1225   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1226 
1227   if (A->symmetric) {
1228     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1229   } else if (A->hermitian) {
1230     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1231   } else if (A->structurally_symmetric) {
1232     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1233   }
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1238 {
1239   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1240   Mat            submat,A = c->A,B = c->B;
1241   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1242   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1243   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1244   const PetscInt *icol,*irow;
1245   PetscInt       nrow,ncol,start;
1246   PetscErrorCode ierr;
1247   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1248   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1249   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1250   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1251   PetscInt       *lens,rmax,ncols,*cols,Crow;
1252 #if defined(PETSC_USE_CTABLE)
1253   PetscTable     cmap,rmap;
1254   PetscInt       *cmap_loc,*rmap_loc;
1255 #else
1256   PetscInt       *cmap,*rmap;
1257 #endif
1258   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1259   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1260   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1261   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1262   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1263   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1264   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1265   MPI_Comm       comm;
1266   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1267   PetscMPIInt    *onodes1,*olengths1,idex,end;
1268   Mat_SubSppt    *smatis1;
1269   PetscBool      isrowsorted,iscolsorted;
1270 
1271   PetscFunctionBegin;
1272   PetscValidLogicalCollectiveInt(C,ismax,2);
1273   PetscValidLogicalCollectiveEnum(C,scall,5);
1274   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1275   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1276   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1277   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1278   size = c->size;
1279   rank = c->rank;
1280 
1281   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1282   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1283   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1284   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1285   if (allcolumns) {
1286     icol = NULL;
1287     ncol = C->cmap->N;
1288   } else {
1289     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1290     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1291   }
1292 
1293   if (scall == MAT_INITIAL_MATRIX) {
1294     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1295 
1296     /* Get some new tags to keep the communication clean */
1297     tag1 = ((PetscObject)C)->tag;
1298     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1299     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1300 
1301     /* evaluate communication - mesg to who, length of mesg, and buffer space
1302      required. Based on this, buffers are allocated, and data copied into them */
1303     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1304     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1305 
1306     /* w1[proc] = num of rows owned by proc -- to be requested */
1307     proc = 0;
1308     nrqs = 0; /* num of outgoing messages */
1309     for (j=0; j<nrow; j++) {
1310       row  = irow[j];
1311       if (!isrowsorted) proc = 0;
1312       while (row >= C->rmap->range[proc+1]) proc++;
1313       w1[proc]++;
1314       row2proc[j] = proc; /* map row index to proc */
1315 
1316       if (proc != rank && !w2[proc]) {
1317         w2[proc] = 1; nrqs++;
1318       }
1319     }
1320     w1[rank] = 0;  /* rows owned by self will not be requested */
1321 
1322     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1323     for (proc=0,j=0; proc<size; proc++) {
1324       if (w1[proc]) { pa[j++] = proc;}
1325     }
1326 
1327     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1328     msz = 0;              /* total mesg length (for all procs) */
1329     for (i=0; i<nrqs; i++) {
1330       proc      = pa[i];
1331       w1[proc] += 3;
1332       msz      += w1[proc];
1333     }
1334     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1335 
1336     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1337     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1338     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1339 
1340     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1341        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1342     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1343 
1344     /* Now post the Irecvs corresponding to these messages */
1345     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1346 
1347     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1348     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1349 
1350     /* Allocate Memory for outgoing messages */
1351     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1352     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1353     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1354 
1355     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1356     iptr = tmp;
1357     for (i=0; i<nrqs; i++) {
1358       proc        = pa[i];
1359       sbuf1[proc] = iptr;
1360       iptr       += w1[proc];
1361     }
1362 
1363     /* Form the outgoing messages */
1364     /* Initialize the header space */
1365     for (i=0; i<nrqs; i++) {
1366       proc      = pa[i];
1367       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1368       ptr[proc] = sbuf1[proc] + 3;
1369     }
1370 
1371     /* Parse the isrow and copy data into outbuf */
1372     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1373     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1374       proc = row2proc[j];
1375       if (proc != rank) { /* copy to the outgoing buf*/
1376         *ptr[proc] = irow[j];
1377         ctr[proc]++; ptr[proc]++;
1378       }
1379     }
1380 
1381     /* Update the headers for the current IS */
1382     for (j=0; j<size; j++) { /* Could optimize this loop too */
1383       if ((ctr_j = ctr[j])) {
1384         sbuf1_j        = sbuf1[j];
1385         k              = ++sbuf1_j[0];
1386         sbuf1_j[2*k]   = ctr_j;
1387         sbuf1_j[2*k-1] = 0;
1388       }
1389     }
1390 
1391     /* Now post the sends */
1392     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1393     for (i=0; i<nrqs; ++i) {
1394       proc = pa[i];
1395       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1396     }
1397 
1398     /* Post Receives to capture the buffer size */
1399     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1400     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1401 
1402     rbuf2[0] = tmp + msz;
1403     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1404 
1405     for (i=0; i<nrqs; ++i) {
1406       proc = pa[i];
1407       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1408     }
1409 
1410     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1411 
1412     /* Send to other procs the buf size they should allocate */
1413     /* Receive messages*/
1414     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1415     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1416 
1417     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1418     for (i=0; i<nrqr; ++i) {
1419       req_size[i] = 0;
1420       rbuf1_i        = rbuf1[i];
1421       start          = 2*rbuf1_i[0] + 1;
1422       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1423       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1424       sbuf2_i        = sbuf2[i];
1425       for (j=start; j<end; j++) {
1426         k            = rbuf1_i[j] - rstart;
1427         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1428         sbuf2_i[j]   = ncols;
1429         req_size[i] += ncols;
1430       }
1431       req_source1[i] = r_status1[i].MPI_SOURCE;
1432 
1433       /* form the header */
1434       sbuf2_i[0] = req_size[i];
1435       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1436 
1437       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1438     }
1439 
1440     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1441     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1442 
1443     /* rbuf2 is received; post receives for the column indices a->j */
1444     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1445 
1446     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1447     for (i=0; i<nrqs; ++i) {
1448       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1449       req_source2[i] = r_status2[i].MPI_SOURCE;
1450       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1451     }
1452 
1453     /* Wait on sends1 and sends2 */
1454     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1455     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1456     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1457     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1458 
1459     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1460     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1461 
1462     /* Now allocate sending buffers for a->j, and send them off */
1463     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1464     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1465     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1466     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1467 
1468     for (i=0; i<nrqr; i++) { /* for each requested message */
1469       rbuf1_i   = rbuf1[i];
1470       sbuf_aj_i = sbuf_aj[i];
1471       ct1       = 2*rbuf1_i[0] + 1;
1472       ct2       = 0;
1473       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1474 
1475       kmax = rbuf1[i][2];
1476       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1477         row    = rbuf1_i[ct1] - rstart;
1478         nzA    = ai[row+1] - ai[row];
1479         nzB    = bi[row+1] - bi[row];
1480         ncols  = nzA + nzB;
1481         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1482 
1483         /* load the column indices for this row into cols*/
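             /* three passes keep the row's global column indices in ascending order:
                off-diagonal columns below cstart, then diagonal-block columns shifted by cstart,
                then off-diagonal columns at or above cend */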
1484         cols = sbuf_aj_i + ct2;
1485 
1486         lwrite = 0;
1487         for (l=0; l<nzB; l++) {
1488           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1489         }
1490         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1491         for (l=0; l<nzB; l++) {
1492           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1493         }
1494 
1495         ct2 += ncols;
1496       }
1497       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1498     }
1499 
1500     /* create column map (cmap): global col of C -> local col of submat */
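         /* cmap stores (local column index)+1 so that a lookup returning 0 means the column is not in the submatrix */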
1501 #if defined(PETSC_USE_CTABLE)
1502     if (!allcolumns) {
1503       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1504       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1505       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1506         if (icol[j] >= cstart && icol[j] <cend) {
1507           cmap_loc[icol[j] - cstart] = j+1;
1508         } else { /* use PetscTable for non-local col indices */
1509           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1510         }
1511       }
1512     } else {
1513       cmap     = NULL;
1514       cmap_loc = NULL;
1515     }
1516     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1517 #else
1518     if (!allcolumns) {
1519       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1520       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1521     } else {
1522       cmap = NULL;
1523     }
1524 #endif
1525 
1526     /* Create lens for MatSeqAIJSetPreallocation() */
1527     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1528 
1529     /* Compute lens from local part of C */
1530     for (j=0; j<nrow; j++) {
1531       row  = irow[j];
1532       proc = row2proc[j];
1533       if (proc == rank) {
1534         /* diagonal part A = c->A */
1535         ncols = ai[row-rstart+1] - ai[row-rstart];
1536         cols  = aj + ai[row-rstart];
1537         if (!allcolumns) {
1538           for (k=0; k<ncols; k++) {
1539 #if defined(PETSC_USE_CTABLE)
1540             tcol = cmap_loc[cols[k]];
1541 #else
1542             tcol = cmap[cols[k]+cstart];
1543 #endif
1544             if (tcol) lens[j]++;
1545           }
1546         } else { /* allcolumns */
1547           lens[j] = ncols;
1548         }
1549 
1550         /* off-diagonal part B = c->B */
1551         ncols = bi[row-rstart+1] - bi[row-rstart];
1552         cols  = bj + bi[row-rstart];
1553         if (!allcolumns) {
1554           for (k=0; k<ncols; k++) {
1555 #if defined(PETSC_USE_CTABLE)
1556             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1557 #else
1558             tcol = cmap[bmap[cols[k]]];
1559 #endif
1560             if (tcol) lens[j]++;
1561           }
1562         } else { /* allcolumns */
1563           lens[j] += ncols;
1564         }
1565       }
1566     }
1567 
1568     /* Create row map (rmap): global row of C -> local row of submat */
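         /* with PETSC_USE_CTABLE the table stores (local row index)+1, decremented after lookup, so that a
            failed lookup can be detected; rows owned locally are kept in the plain array rmap_loc instead */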
1569 #if defined(PETSC_USE_CTABLE)
1570     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1571     for (j=0; j<nrow; j++) {
1572       row  = irow[j];
1573       proc = row2proc[j];
1574       if (proc == rank) { /* a local row */
1575         rmap_loc[row - rstart] = j;
1576       } else {
1577         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1578       }
1579     }
1580 #else
1581     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1582     for (j=0; j<nrow; j++) {
1583       rmap[irow[j]] = j;
1584     }
1585 #endif
1586 
1587     /* Update lens from offproc data */
1588     /* recv a->j is done */
1589     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1590     for (i=0; i<nrqs; i++) {
1591       proc    = pa[i];
1592       sbuf1_i = sbuf1[proc];
1593       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1594       ct1     = 2 + 1;
1595       ct2     = 0;
1596       rbuf2_i = rbuf2[i]; /* received length of C->j */
1597       rbuf3_i = rbuf3[i]; /* received C->j */
1598 
1599       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1600       max1   = sbuf1_i[2];
1601       for (k=0; k<max1; k++,ct1++) {
1602 #if defined(PETSC_USE_CTABLE)
1603         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1604         row--;
1605         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1606 #else
1607         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1608 #endif
1609         /* Now, store row index of submat in sbuf1_i[ct1] */
1610         sbuf1_i[ct1] = row;
1611 
1612         nnz = rbuf2_i[ct1];
1613         if (!allcolumns) {
1614           for (l=0; l<nnz; l++,ct2++) {
1615 #if defined(PETSC_USE_CTABLE)
1616             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1617               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1618             } else {
1619               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1620             }
1621 #else
1622             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1623 #endif
1624             if (tcol) lens[row]++;
1625           }
1626         } else { /* allcolumns */
1627           lens[row] += nnz;
1628         }
1629       }
1630     }
1631     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1632     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1633 
1634     /* Create the submatrices */
1635     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1636     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1637 
1638     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1639     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1640     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1641     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1642     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1643 
1644     /* create struct Mat_SubSppt and attach it to submat */
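         /* the buffers and maps stashed below let a later MAT_REUSE_MATRIX call skip the symbolic
            communication and exchange only the numerical values */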
1645     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1646     subc = (Mat_SeqAIJ*)submat->data;
1647     subc->submatis1 = smatis1;
1648 
1649     smatis1->id          = 0;
1650     smatis1->nrqs        = nrqs;
1651     smatis1->nrqr        = nrqr;
1652     smatis1->rbuf1       = rbuf1;
1653     smatis1->rbuf2       = rbuf2;
1654     smatis1->rbuf3       = rbuf3;
1655     smatis1->sbuf2       = sbuf2;
1656     smatis1->req_source2 = req_source2;
1657 
1658     smatis1->sbuf1       = sbuf1;
1659     smatis1->ptr         = ptr;
1660     smatis1->tmp         = tmp;
1661     smatis1->ctr         = ctr;
1662 
1663     smatis1->pa           = pa;
1664     smatis1->req_size     = req_size;
1665     smatis1->req_source1  = req_source1;
1666 
1667     smatis1->allcolumns  = allcolumns;
1668     smatis1->singleis    = PETSC_TRUE;
1669     smatis1->row2proc    = row2proc;
1670     smatis1->rmap        = rmap;
1671     smatis1->cmap        = cmap;
1672 #if defined(PETSC_USE_CTABLE)
1673     smatis1->rmap_loc    = rmap_loc;
1674     smatis1->cmap_loc    = cmap_loc;
1675 #endif
1676 
1677     smatis1->destroy     = submat->ops->destroy;
1678     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1679     submat->factortype   = C->factortype;
1680 
1681     /* compute rmax */
1682     rmax = 0;
1683     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1684 
1685   } else { /* scall == MAT_REUSE_MATRIX */
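         /* retrieve the communication pattern saved by the MAT_INITIAL_MATRIX call; only numerical
            values are exchanged below */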
1686     submat = submats[0];
1687     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1688 
1689     subc    = (Mat_SeqAIJ*)submat->data;
1690     rmax    = subc->rmax;
1691     smatis1 = subc->submatis1;
1692     nrqs        = smatis1->nrqs;
1693     nrqr        = smatis1->nrqr;
1694     rbuf1       = smatis1->rbuf1;
1695     rbuf2       = smatis1->rbuf2;
1696     rbuf3       = smatis1->rbuf3;
1697     req_source2 = smatis1->req_source2;
1698 
1699     sbuf1     = smatis1->sbuf1;
1700     sbuf2     = smatis1->sbuf2;
1701     ptr       = smatis1->ptr;
1702     tmp       = smatis1->tmp;
1703     ctr       = smatis1->ctr;
1704 
1705     pa         = smatis1->pa;
1706     req_size   = smatis1->req_size;
1707     req_source1 = smatis1->req_source1;
1708 
1709     allcolumns = smatis1->allcolumns;
1710     row2proc   = smatis1->row2proc;
1711     rmap       = smatis1->rmap;
1712     cmap       = smatis1->cmap;
1713 #if defined(PETSC_USE_CTABLE)
1714     rmap_loc   = smatis1->rmap_loc;
1715     cmap_loc   = smatis1->cmap_loc;
1716 #endif
1717   }
1718 
1719   /* Post recv matrix values */
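       /* rbuf2[i][0] is the total number of nonzeros the sending process provides, so the value buffer
          rbuf4[i] has the same length as the column-index buffer rbuf3[i] */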
1720   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1721   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1722   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1723   for (i=0; i<nrqs; ++i) {
1724     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1725     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1726   }
1727 
1728   /* Allocate sending buffers for a->a, and send them off */
1729   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1730   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1731   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1732   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1733 
1734   for (i=0; i<nrqr; i++) {
1735     rbuf1_i   = rbuf1[i];
1736     sbuf_aa_i = sbuf_aa[i];
1737     ct1       = 2*rbuf1_i[0]+1;
1738     ct2       = 0;
1739     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1740 
1741     kmax = rbuf1_i[2];
1742     for (k=0; k<kmax; k++,ct1++) {
1743       row = rbuf1_i[ct1] - rstart;
1744       nzA = ai[row+1] - ai[row];
1745       nzB = bi[row+1] - bi[row];
1746       ncols  = nzA + nzB;
1747       cworkB = bj + bi[row];
1748       vworkA = a_a + ai[row];
1749       vworkB = b_a + bi[row];
1750 
1751       /* load the column values for this row into vals*/
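           /* values are written in the same three-pass order as the column indices sent in sbuf_aj:
              off-diagonal entries with global column < cstart, diagonal-block entries, then off-diagonal entries >= cend */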
1752       vals = sbuf_aa_i + ct2;
1753 
1754       lwrite = 0;
1755       for (l=0; l<nzB; l++) {
1756         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1757       }
1758       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1759       for (l=0; l<nzB; l++) {
1760         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1761       }
1762 
1763       ct2 += ncols;
1764     }
1765     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1766   }
1767 
1768   /* Assemble submat */
1769   /* First assemble the local rows */
1770   for (j=0; j<nrow; j++) {
1771     row  = irow[j];
1772     proc = row2proc[j];
1773     if (proc == rank) {
1774       Crow = row - rstart;  /* local row index of C */
1775 #if defined(PETSC_USE_CTABLE)
1776       row = rmap_loc[Crow]; /* row index of submat */
1777 #else
1778       row = rmap[row];
1779 #endif
1780 
1781       if (allcolumns) {
1782         /* diagonal part A = c->A */
1783         ncols = ai[Crow+1] - ai[Crow];
1784         cols  = aj + ai[Crow];
1785         vals  = a_a + ai[Crow];
1786         i     = 0;
1787         for (k=0; k<ncols; k++) {
1788           subcols[i]   = cols[k] + cstart;
1789           subvals[i++] = vals[k];
1790         }
1791 
1792         /* off-diagonal part B = c->B */
1793         ncols = bi[Crow+1] - bi[Crow];
1794         cols  = bj + bi[Crow];
1795         vals  = b_a + bi[Crow];
1796         for (k=0; k<ncols; k++) {
1797           subcols[i]   = bmap[cols[k]];
1798           subvals[i++] = vals[k];
1799         }
1800 
1801         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1802 
1803       } else { /* !allcolumns */
1804 #if defined(PETSC_USE_CTABLE)
1805         /* diagonal part A = c->A */
1806         ncols = ai[Crow+1] - ai[Crow];
1807         cols  = aj + ai[Crow];
1808         vals  = a_a + ai[Crow];
1809         i     = 0;
1810         for (k=0; k<ncols; k++) {
1811           tcol = cmap_loc[cols[k]];
1812           if (tcol) {
1813             subcols[i]   = --tcol;
1814             subvals[i++] = vals[k];
1815           }
1816         }
1817 
1818         /* off-diagonal part B = c->B */
1819         ncols = bi[Crow+1] - bi[Crow];
1820         cols  = bj + bi[Crow];
1821         vals  = b_a + bi[Crow];
1822         for (k=0; k<ncols; k++) {
1823           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1824           if (tcol) {
1825             subcols[i]   = --tcol;
1826             subvals[i++] = vals[k];
1827           }
1828         }
1829 #else
1830         /* diagonal part A = c->A */
1831         ncols = ai[Crow+1] - ai[Crow];
1832         cols  = aj + ai[Crow];
1833         vals  = a_a + ai[Crow];
1834         i     = 0;
1835         for (k=0; k<ncols; k++) {
1836           tcol = cmap[cols[k]+cstart];
1837           if (tcol) {
1838             subcols[i]   = --tcol;
1839             subvals[i++] = vals[k];
1840           }
1841         }
1842 
1843         /* off-diagonal part B = c->B */
1844         ncols = bi[Crow+1] - bi[Crow];
1845         cols  = bj + bi[Crow];
1846         vals  = b_a + bi[Crow];
1847         for (k=0; k<ncols; k++) {
1848           tcol = cmap[bmap[cols[k]]];
1849           if (tcol) {
1850             subcols[i]   = --tcol;
1851             subvals[i++] = vals[k];
1852           }
1853         }
1854 #endif
1855         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1856       }
1857     }
1858   }
1859 
1860   /* Now assemble the off-proc rows */
1861   for (i=0; i<nrqs; i++) { /* for each requested message */
1862     /* recv values from other processes */
1863     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1864     proc    = pa[idex];
1865     sbuf1_i = sbuf1[proc];
1866     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1867     ct1     = 2 + 1;
1868     ct2     = 0; /* count of received C->j */
1869     ct3     = 0; /* count of received C->j that will be inserted into submat */
1870     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from proc pa[idex] */
1871     rbuf3_i = rbuf3[idex]; /* received column indices C->j from proc pa[idex] */
1872     rbuf4_i = rbuf4[idex]; /* received values C->a from proc pa[idex] */
1873 
1874     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1875     max1 = sbuf1_i[2];             /* num of rows */
1876     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1877       row = sbuf1_i[ct1]; /* row index of submat */
1878       if (!allcolumns) {
1879         idex = 0;
1880         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1881           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1882           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1883 #if defined(PETSC_USE_CTABLE)
1884             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1885               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1886             } else {
1887               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1888             }
1889 #else
1890             tcol = cmap[rbuf3_i[ct2]];
1891 #endif
1892             if (tcol) {
1893               subcols[idex]   = --tcol; /* may not be sorted */
1894               subvals[idex++] = rbuf4_i[ct2];
1895 
1896               /* We receive an entire row of C, but only a subset of its entries needs to be inserted into submat.
1897                For reuse, we replace the received C->j entry with the index that should be inserted into submat */
1898               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1899             }
1900           }
1901           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1902         } else { /* scall == MAT_REUSE_MATRIX */
1903           submat = submats[0];
1904           subc   = (Mat_SeqAIJ*)submat->data;
1905 
1906           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1907           for (l=0; l<nnz; l++) {
1908             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1909             subvals[idex++] = rbuf4_i[ct2];
1910           }
1911 
1912           bj = subc->j + subc->i[row]; /* sorted column indices */
1913           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1914         }
1915       } else { /* allcolumns */
1916         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1917         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1918         ct2 += nnz;
1919       }
1920     }
1921   }
1922 
1923   /* sends of a->a are done */
1924   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1925   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1926 
1927   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1928   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1929   submats[0] = submat;
1930 
1931   /* Restore the indices */
1932   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1933   if (!allcolumns) {
1934     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1935   }
1936 
1937   /* Destroy allocated memory */
1938   for (i=0; i<nrqs; ++i) {
1939     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1940   }
1941   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1942   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1943   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1944 
1945   if (scall == MAT_INITIAL_MATRIX) {
1946     ierr = PetscFree(lens);CHKERRQ(ierr);
1947     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1948     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1949   }
1950   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1951   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1956 {
1957   PetscErrorCode ierr;
1958   PetscInt       ncol;
1959   PetscBool      colflag,allcolumns=PETSC_FALSE;
1960 
1961   PetscFunctionBegin;
1962   /* Allocate memory to hold all the submatrices */
1963   if (scall == MAT_INITIAL_MATRIX) {
1964     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1965   }
1966 
1967   /* Check for special case: each processor gets entire matrix columns */
1968   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1969   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1970   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
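       /* with allcolumns, no column map is built and the global column indices of C are used directly in the submatrix */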
1971 
1972   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1977 {
1978   PetscErrorCode ierr;
1979   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1980   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1981   Mat_SeqAIJ     *subc;
1982   Mat_SubSppt    *smat;
1983 
1984   PetscFunctionBegin;
1985   /* Check for special case: each processor has a single IS */
1986   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1987     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1988     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1989     PetscFunctionReturn(0);
1990   }
1991 
1992   /* Collect global wantallmatrix and nstages */
1993   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1994   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1995   if (!nmax) nmax = 1;
1996 
1997   if (scall == MAT_INITIAL_MATRIX) {
1998     /* Collect global wantallmatrix and nstages */
1999     if (ismax == 1 && C->rmap->N == C->cmap->N) {
2000       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2001       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2002       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2003       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2004       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2005         wantallmatrix = PETSC_TRUE;
2006 
2007         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2008       }
2009     }
2010 
2011     /* Determine the number of stages through which the submatrices are extracted.
2012        Each stage extracts at most nmax submatrices.
2013        nmax is determined by the matrix column dimension:
2014        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2015     */
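         /* for instance, assuming 8-byte PetscInt and C->cmap->N = 1,000,000 columns:
            nmax = 20000000/(1000000*8) = 2, i.e. two submatrices per stage */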
2016     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2017 
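         /* wantallmatrix is encoded as a negative flag so that a single MPIU_Allreduce with MPI_MAX
            yields both the logical AND of wantallmatrix over all ranks and the global maximum of nstages */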
2018     in[0] = -1*(PetscInt)wantallmatrix;
2019     in[1] = nstages;
2020     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2021     wantallmatrix = (PetscBool)(-out[0]);
2022     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2023 
2024   } else { /* MAT_REUSE_MATRIX */
2025     if (ismax) {
2026       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2027       smat = subc->submatis1;
2028     } else { /* (*submat)[0] is a dummy matrix */
2029       smat = (Mat_SubSppt*)(*submat)[0]->data;
2030     }
2031     if (!smat) {
2032       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2033       wantallmatrix = PETSC_TRUE;
2034     } else if (smat->singleis) {
2035       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2036       PetscFunctionReturn(0);
2037     } else {
2038       nstages = smat->nstages;
2039     }
2040   }
2041 
2042   if (wantallmatrix) {
2043     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2044     PetscFunctionReturn(0);
2045   }
2046 
2047   /* Allocate memory to hold all the submatrices and dummy submatrices */
2048   if (scall == MAT_INITIAL_MATRIX) {
2049     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2050   }
2051 
2052   for (i=0,pos=0; i<nstages; i++) {
2053     if (pos+nmax <= ismax) max_no = nmax;
2054     else if (pos >= ismax) max_no = 0;
2055     else                   max_no = ismax-pos;
2056 
2057     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2058     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2059       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2060       smat->nstages = nstages;
2061     }
2062     pos += max_no;
2063   }
2064 
2065   if (ismax && scall == MAT_INITIAL_MATRIX) {
2066     /* save nstages for reuse */
2067     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2068     smat = subc->submatis1;
2069     smat->nstages = nstages;
2070   }
2071   PetscFunctionReturn(0);
2072 }
2073 
2074 /* -------------------------------------------------------------------------*/
2075 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2076 {
2077   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2078   Mat            A  = c->A;
2079   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2080   const PetscInt **icol,**irow;
2081   PetscInt       *nrow,*ncol,start;
2082   PetscErrorCode ierr;
2083   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2084   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2085   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2086   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2087   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2088 #if defined(PETSC_USE_CTABLE)
2089   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2090 #else
2091   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2092 #endif
2093   const PetscInt *irow_i;
2094   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2095   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2096   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2097   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2098   MPI_Status     *r_status3,*r_status4,*s_status4;
2099   MPI_Comm       comm;
2100   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2101   PetscMPIInt    *onodes1,*olengths1,end;
2102   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2103   Mat_SubSppt    *smat_i;
2104   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2105   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2106 
2107   PetscFunctionBegin;
2108   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2109   size = c->size;
2110   rank = c->rank;
2111 
2112   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2113   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2114 
2115   for (i=0; i<ismax; i++) {
2116     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2117     if (!issorted[i]) iscsorted = issorted[i];
2118 
2119     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2120 
2121     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2122     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2123 
2124     /* Check for special case: allcolumn */
2125     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2126     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2127     if (colflag && ncol[i] == C->cmap->N) {
2128       allcolumns[i] = PETSC_TRUE;
2129       icol[i] = NULL;
2130     } else {
2131       allcolumns[i] = PETSC_FALSE;
2132       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2133     }
2134   }
2135 
2136   if (scall == MAT_REUSE_MATRIX) {
2137     /* Assumes new rows are same length as the old rows */
2138     for (i=0; i<ismax; i++) {
2139       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2140       subc = (Mat_SeqAIJ*)submats[i]->data;
2141       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2142 
2143       /* Initialize matrix as if empty */
2144       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2145 
2146       smat_i   = subc->submatis1;
2147 
2148       nrqs        = smat_i->nrqs;
2149       nrqr        = smat_i->nrqr;
2150       rbuf1       = smat_i->rbuf1;
2151       rbuf2       = smat_i->rbuf2;
2152       rbuf3       = smat_i->rbuf3;
2153       req_source2 = smat_i->req_source2;
2154 
2155       sbuf1     = smat_i->sbuf1;
2156       sbuf2     = smat_i->sbuf2;
2157       ptr       = smat_i->ptr;
2158       tmp       = smat_i->tmp;
2159       ctr       = smat_i->ctr;
2160 
2161       pa          = smat_i->pa;
2162       req_size    = smat_i->req_size;
2163       req_source1 = smat_i->req_source1;
2164 
2165       allcolumns[i] = smat_i->allcolumns;
2166       row2proc[i]   = smat_i->row2proc;
2167       rmap[i]       = smat_i->rmap;
2168       cmap[i]       = smat_i->cmap;
2169     }
2170 
2171     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2172       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2173       smat_i = (Mat_SubSppt*)submats[0]->data;
2174 
2175       nrqs        = smat_i->nrqs;
2176       nrqr        = smat_i->nrqr;
2177       rbuf1       = smat_i->rbuf1;
2178       rbuf2       = smat_i->rbuf2;
2179       rbuf3       = smat_i->rbuf3;
2180       req_source2 = smat_i->req_source2;
2181 
2182       sbuf1       = smat_i->sbuf1;
2183       sbuf2       = smat_i->sbuf2;
2184       ptr         = smat_i->ptr;
2185       tmp         = smat_i->tmp;
2186       ctr         = smat_i->ctr;
2187 
2188       pa          = smat_i->pa;
2189       req_size    = smat_i->req_size;
2190       req_source1 = smat_i->req_source1;
2191 
2192       allcolumns[0] = PETSC_FALSE;
2193     }
2194   } else { /* scall == MAT_INITIAL_MATRIX */
2195     /* Get some new tags to keep the communication clean */
2196     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2197     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2198 
2199     /* evaluate communication - message to whom, length of message, and buffer space
2200      required. Based on this, buffers are allocated and data copied into them */
2201     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2202 
2203     for (i=0; i<ismax; i++) {
2204       jmax   = nrow[i];
2205       irow_i = irow[i];
2206 
2207       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2208       row2proc[i] = row2proc_i;
2209 
2210       if (issorted[i]) proc = 0;
2211       for (j=0; j<jmax; j++) {
2212         if (!issorted[i]) proc = 0;
2213         row = irow_i[j];
2214         while (row >= C->rmap->range[proc+1]) proc++;
2215         w4[proc]++;
2216         row2proc_i[j] = proc; /* map row index to proc */
2217       }
2218       for (j=0; j<size; j++) {
2219         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2220       }
2221     }
2222 
2223     nrqs     = 0;              /* no of outgoing messages */
2224     msz      = 0;              /* total mesg length (for all procs) */
2225     w1[rank] = 0;              /* no mesg sent to self */
2226     w3[rank] = 0;
2227     for (i=0; i<size; i++) {
2228       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2229     }
2230     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2231     for (i=0,j=0; i<size; i++) {
2232       if (w1[i]) { pa[j] = i; j++; }
2233     }
2234 
2235     /* Each message would have a header = 1 + 2*(no of IS) + data */
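         /* Message layout: sbuf1[j][0] = number of IS blocks, then for each IS a pair
            (IS index, number of rows requested), followed by the requested global row indices */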
2236     for (i=0; i<nrqs; i++) {
2237       j      = pa[i];
2238       w1[j] += w2[j] + 2* w3[j];
2239       msz   += w1[j];
2240     }
2241     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2242 
2243     /* Determine the number of messages to expect, their lengths, and the sending ranks (from-ids) */
2244     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2245     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2246 
2247     /* Now post the Irecvs corresponding to these messages */
2248     tag0 = ((PetscObject)C)->tag;
2249     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2250 
2251     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2252     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2253 
2254     /* Allocate Memory for outgoing messages */
2255     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2256     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2257     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2258 
2259     {
2260       PetscInt *iptr = tmp;
2261       k    = 0;
2262       for (i=0; i<nrqs; i++) {
2263         j        = pa[i];
2264         iptr    += k;
2265         sbuf1[j] = iptr;
2266         k        = w1[j];
2267       }
2268     }
2269 
2270     /* Form the outgoing messages. Initialize the header space */
2271     for (i=0; i<nrqs; i++) {
2272       j           = pa[i];
2273       sbuf1[j][0] = 0;
2274       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2275       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2276     }
2277 
2278     /* Parse the isrow and copy data into outbuf */
2279     for (i=0; i<ismax; i++) {
2280       row2proc_i = row2proc[i];
2281       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2282       irow_i = irow[i];
2283       jmax   = nrow[i];
2284       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2285         proc = row2proc_i[j];
2286         if (proc != rank) { /* copy to the outgoing buf*/
2287           ctr[proc]++;
2288           *ptr[proc] = irow_i[j];
2289           ptr[proc]++;
2290         }
2291       }
2292       /* Update the headers for the current IS */
2293       for (j=0; j<size; j++) { /* Could optimize this loop too */
2294         if ((ctr_j = ctr[j])) {
2295           sbuf1_j        = sbuf1[j];
2296           k              = ++sbuf1_j[0];
2297           sbuf1_j[2*k]   = ctr_j;
2298           sbuf1_j[2*k-1] = i;
2299         }
2300       }
2301     }
2302 
2303     /* Now post the sends */
2304     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2305     for (i=0; i<nrqs; ++i) {
2306       j    = pa[i];
2307       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2308     }
2309 
2310     /* Post Receives to capture the buffer size */
2311     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2312     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2313     rbuf2[0] = tmp + msz;
2314     for (i=1; i<nrqs; ++i) {
2315       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2316     }
2317     for (i=0; i<nrqs; ++i) {
2318       j    = pa[i];
2319       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2320     }
2321 
2322     /* Send to other procs the buf size they should allocate */
2323     /* Receive messages*/
2324     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2325     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2326     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2327     {
2328       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2329       PetscInt   *sbuf2_i;
2330 
2331       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
2332       for (i=0; i<nrqr; ++i) {
2333         req_size[i] = 0;
2334         rbuf1_i        = rbuf1[i];
2335         start          = 2*rbuf1_i[0] + 1;
2336         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
2337         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2338         sbuf2_i        = sbuf2[i];
2339         for (j=start; j<end; j++) {
2340           id              = rbuf1_i[j] - rstart;
2341           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2342           sbuf2_i[j]      = ncols;
2343           req_size[i] += ncols;
2344         }
2345         req_source1[i] = r_status1[i].MPI_SOURCE;
2346         /* form the header */
2347         sbuf2_i[0] = req_size[i];
2348         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2349 
2350         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2351       }
2352     }
2353     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2354     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2355     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2356 
2357     /* Receive messages*/
2358     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2359     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2360 
2361     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
2362     for (i=0; i<nrqs; ++i) {
2363       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2364       req_source2[i] = r_status2[i].MPI_SOURCE;
2365       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2366     }
2367     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2368     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2369 
2370     /* Wait on sends1 and sends2 */
2371     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2372     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2373 
2374     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);}
2375     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);}
2376     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2377     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2378     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2379     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2380 
2381     /* Now allocate sending buffers for a->j, and send them off */
2382     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2383     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2384     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2385     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2386 
2387     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2388     {
2389       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2390       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2391       PetscInt cend = C->cmap->rend;
2392       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2393 
2394       for (i=0; i<nrqr; i++) {
2395         rbuf1_i   = rbuf1[i];
2396         sbuf_aj_i = sbuf_aj[i];
2397         ct1       = 2*rbuf1_i[0] + 1;
2398         ct2       = 0;
2399         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2400           kmax = rbuf1[i][2*j];
2401           for (k=0; k<kmax; k++,ct1++) {
2402             row    = rbuf1_i[ct1] - rstart;
2403             nzA    = a_i[row+1] - a_i[row];
2404             nzB    = b_i[row+1] - b_i[row];
2405             ncols  = nzA + nzB;
2406             cworkA = a_j + a_i[row];
2407             cworkB = b_j + b_i[row];
2408 
2409             /* load the column indices for this row into cols */
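                 /* as in the SingleIS routine above, three passes produce ascending global column indices:
                    off-diagonal columns < cstart, diagonal-block columns shifted by cstart, then columns >= cend */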
2410             cols = sbuf_aj_i + ct2;
2411 
2412             lwrite = 0;
2413             for (l=0; l<nzB; l++) {
2414               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2415             }
2416             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2417             for (l=0; l<nzB; l++) {
2418               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2419             }
2420 
2421             ct2 += ncols;
2422           }
2423         }
2424         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2425       }
2426     }
2427     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2428 
2429     /* create col map: global col of C -> local col of submatrices */
2430     {
2431       const PetscInt *icol_i;
2432 #if defined(PETSC_USE_CTABLE)
2433       for (i=0; i<ismax; i++) {
2434         if (!allcolumns[i]) {
2435           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2436 
2437           jmax   = ncol[i];
2438           icol_i = icol[i];
2439           cmap_i = cmap[i];
2440           for (j=0; j<jmax; j++) {
2441             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2442           }
2443         } else cmap[i] = NULL;
2444       }
2445 #else
2446       for (i=0; i<ismax; i++) {
2447         if (!allcolumns[i]) {
2448           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2449           jmax   = ncol[i];
2450           icol_i = icol[i];
2451           cmap_i = cmap[i];
2452           for (j=0; j<jmax; j++) {
2453             cmap_i[icol_i[j]] = j+1;
2454           }
2455         } else cmap[i] = NULL;
2456       }
2457 #endif
2458     }
2459 
2460     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2461     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2462     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2463 
2464     if (ismax) {
2465       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2466     }
2467     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2468 
2469     /* Update lens from local data */
2470     for (i=0; i<ismax; i++) {
2471       row2proc_i = row2proc[i];
2472       jmax = nrow[i];
2473       if (!allcolumns[i]) cmap_i = cmap[i];
2474       irow_i = irow[i];
2475       lens_i = lens[i];
2476       for (j=0; j<jmax; j++) {
2477         row = irow_i[j];
2478         proc = row2proc_i[j];
2479         if (proc == rank) {
2480           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2481           if (!allcolumns[i]) {
2482             for (k=0; k<ncols; k++) {
2483 #if defined(PETSC_USE_CTABLE)
2484               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2485 #else
2486               tcol = cmap_i[cols[k]];
2487 #endif
2488               if (tcol) lens_i[j]++;
2489             }
2490           } else { /* allcolumns */
2491             lens_i[j] = ncols;
2492           }
2493           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2494         }
2495       }
2496     }
2497 
2498     /* Create row map: global row of C -> local row of submatrices */
2499 #if defined(PETSC_USE_CTABLE)
2500     for (i=0; i<ismax; i++) {
2501       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2502       irow_i = irow[i];
2503       jmax   = nrow[i];
2504       for (j=0; j<jmax; j++) {
2505         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2506       }
2507     }
2508 #else
2509     for (i=0; i<ismax; i++) {
2510       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2511       rmap_i = rmap[i];
2512       irow_i = irow[i];
2513       jmax   = nrow[i];
2514       for (j=0; j<jmax; j++) {
2515         rmap_i[irow_i[j]] = j;
2516       }
2517     }
2518 #endif
2519 
2520     /* Update lens from offproc data */
2521     {
2522       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2523 
2524       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
2525       for (tmp2=0; tmp2<nrqs; tmp2++) {
2526         sbuf1_i = sbuf1[pa[tmp2]];
2527         jmax    = sbuf1_i[0];
2528         ct1     = 2*jmax+1;
2529         ct2     = 0;
2530         rbuf2_i = rbuf2[tmp2];
2531         rbuf3_i = rbuf3[tmp2];
2532         for (j=1; j<=jmax; j++) {
2533           is_no  = sbuf1_i[2*j-1];
2534           max1   = sbuf1_i[2*j];
2535           lens_i = lens[is_no];
2536           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2537           rmap_i = rmap[is_no];
2538           for (k=0; k<max1; k++,ct1++) {
2539 #if defined(PETSC_USE_CTABLE)
2540             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2541             row--;
2542             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2543 #else
2544             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2545 #endif
2546             max2 = rbuf2_i[ct1];
2547             for (l=0; l<max2; l++,ct2++) {
2548               if (!allcolumns[is_no]) {
2549 #if defined(PETSC_USE_CTABLE)
2550                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2551 #else
2552                 tcol = cmap_i[rbuf3_i[ct2]];
2553 #endif
2554                 if (tcol) lens_i[row]++;
2555               } else { /* allcolumns */
2556                 lens_i[row]++; /* lens_i[row] += max2 ? */
2557               }
2558             }
2559           }
2560         }
2561       }
2562     }
2563     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2564     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);}
2565     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2566     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2567 
2568     /* Create the submatrices */
2569     for (i=0; i<ismax; i++) {
2570       PetscInt    rbs,cbs;
2571 
2572       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2573       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2574 
2575       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2576       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2577 
2578       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2579       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2580       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2581 
2582       /* create struct Mat_SubSppt and attach it to submat */
2583       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2584       subc = (Mat_SeqAIJ*)submats[i]->data;
2585       subc->submatis1 = smat_i;
2586 
2587       smat_i->destroy          = submats[i]->ops->destroy;
2588       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2589       submats[i]->factortype   = C->factortype;
2590 
2591       smat_i->id          = i;
2592       smat_i->nrqs        = nrqs;
2593       smat_i->nrqr        = nrqr;
2594       smat_i->rbuf1       = rbuf1;
2595       smat_i->rbuf2       = rbuf2;
2596       smat_i->rbuf3       = rbuf3;
2597       smat_i->sbuf2       = sbuf2;
2598       smat_i->req_source2 = req_source2;
2599 
2600       smat_i->sbuf1       = sbuf1;
2601       smat_i->ptr         = ptr;
2602       smat_i->tmp         = tmp;
2603       smat_i->ctr         = ctr;
2604 
2605       smat_i->pa           = pa;
2606       smat_i->req_size     = req_size;
2607       smat_i->req_source1  = req_source1;
2608 
2609       smat_i->allcolumns  = allcolumns[i];
2610       smat_i->singleis    = PETSC_FALSE;
2611       smat_i->row2proc    = row2proc[i];
2612       smat_i->rmap        = rmap[i];
2613       smat_i->cmap        = cmap[i];
2614     }
2615 
2616     if (!ismax) { /* Create a dummy submats[0] that carries the reuse struct Mat_SubSppt */
2617       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2618       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2619       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2620 
2621       /* create struct Mat_SubSppt and attach it to submat */
2622       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2623       submats[0]->data = (void*)smat_i;
2624 
2625       smat_i->destroy          = submats[0]->ops->destroy;
2626       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2627       submats[0]->factortype   = C->factortype;
2628 
2629       smat_i->id          = 0;
2630       smat_i->nrqs        = nrqs;
2631       smat_i->nrqr        = nrqr;
2632       smat_i->rbuf1       = rbuf1;
2633       smat_i->rbuf2       = rbuf2;
2634       smat_i->rbuf3       = rbuf3;
2635       smat_i->sbuf2       = sbuf2;
2636       smat_i->req_source2 = req_source2;
2637 
2638       smat_i->sbuf1       = sbuf1;
2639       smat_i->ptr         = ptr;
2640       smat_i->tmp         = tmp;
2641       smat_i->ctr         = ctr;
2642 
2643       smat_i->pa           = pa;
2644       smat_i->req_size     = req_size;
2645       smat_i->req_source1  = req_source1;
2646 
2647       smat_i->allcolumns  = PETSC_FALSE;
2648       smat_i->singleis    = PETSC_FALSE;
2649       smat_i->row2proc    = NULL;
2650       smat_i->rmap        = NULL;
2651       smat_i->cmap        = NULL;
2652     }
2653 
2654     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2655     ierr = PetscFree(lens);CHKERRQ(ierr);
2656     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2657     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2658 
2659   } /* endof scall == MAT_INITIAL_MATRIX */
2660 
2661   /* Post recv matrix values */
2662   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2663   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2664   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2665   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2666   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2667   for (i=0; i<nrqs; ++i) {
2668     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2669     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2670   }
2671 
2672   /* Allocate sending buffers for a->a, and send them off */
2673   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2674   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2675   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2676   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2677 
2678   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2679   {
2680     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2681     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2682     PetscInt    cend   = C->cmap->rend;
2683     PetscInt    *b_j   = b->j;
2684     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2685 
2686     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2687     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2688     for (i=0; i<nrqr; i++) {
2689       rbuf1_i   = rbuf1[i];
2690       sbuf_aa_i = sbuf_aa[i];
2691       ct1       = 2*rbuf1_i[0]+1;
2692       ct2       = 0;
2693       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2694         kmax = rbuf1_i[2*j];
2695         for (k=0; k<kmax; k++,ct1++) {
2696           row    = rbuf1_i[ct1] - rstart;
2697           nzA    = a_i[row+1] - a_i[row];
2698           nzB    = b_i[row+1] - b_i[row];
2699           ncols  = nzA + nzB;
2700           cworkB = b_j + b_i[row];
2701           vworkA = a_a + a_i[row];
2702           vworkB = b_a + b_i[row];
2703 
2704           /* load the column values for this row into vals*/
2705           vals = sbuf_aa_i+ct2;
2706 
2707           lwrite = 0;
2708           for (l=0; l<nzB; l++) {
2709             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2710           }
2711           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2712           for (l=0; l<nzB; l++) {
2713             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2714           }
2715 
2716           ct2 += ncols;
2717         }
2718       }
2719       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2720     }
2721     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2722     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2723   }
2724 
2725   /* Assemble the matrices */
2726   /* First assemble the local rows */
2727   for (i=0; i<ismax; i++) {
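       /* entries are written directly into each submatrix's preallocated CSR arrays (subc->i/j/a);
          subc->ilen[row] records how many entries of each row have been filled */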
2728     row2proc_i = row2proc[i];
2729     subc      = (Mat_SeqAIJ*)submats[i]->data;
2730     imat_ilen = subc->ilen;
2731     imat_j    = subc->j;
2732     imat_i    = subc->i;
2733     imat_a    = subc->a;
2734 
2735     if (!allcolumns[i]) cmap_i = cmap[i];
2736     rmap_i = rmap[i];
2737     irow_i = irow[i];
2738     jmax   = nrow[i];
2739     for (j=0; j<jmax; j++) {
2740       row  = irow_i[j];
2741       proc = row2proc_i[j];
2742       if (proc == rank) {
2743         old_row = row;
2744 #if defined(PETSC_USE_CTABLE)
2745         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2746         row--;
2747 #else
2748         row = rmap_i[row];
2749 #endif
2750         ilen_row = imat_ilen[row];
2751         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2752         mat_i    = imat_i[row];
2753         mat_a    = imat_a + mat_i;
2754         mat_j    = imat_j + mat_i;
2755         if (!allcolumns[i]) {
2756           for (k=0; k<ncols; k++) {
2757 #if defined(PETSC_USE_CTABLE)
2758             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2759 #else
2760             tcol = cmap_i[cols[k]];
2761 #endif
2762             if (tcol) {
2763               *mat_j++ = tcol - 1;
2764               *mat_a++ = vals[k];
2765               ilen_row++;
2766             }
2767           }
2768         } else { /* allcolumns */
2769           for (k=0; k<ncols; k++) {
2770             *mat_j++ = cols[k];  /* global col index! */
2771             *mat_a++ = vals[k];
2772             ilen_row++;
2773           }
2774         }
2775         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2776 
2777         imat_ilen[row] = ilen_row;
2778       }
2779     }
2780   }
2781 
2782   /* Now assemble the off proc rows */
2783   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr);
2784   for (tmp2=0; tmp2<nrqs; tmp2++) {
2785     sbuf1_i = sbuf1[pa[tmp2]];
2786     jmax    = sbuf1_i[0];
2787     ct1     = 2*jmax + 1;
2788     ct2     = 0;
2789     rbuf2_i = rbuf2[tmp2];
2790     rbuf3_i = rbuf3[tmp2];
2791     rbuf4_i = rbuf4[tmp2];
2792     for (j=1; j<=jmax; j++) {
2793       is_no     = sbuf1_i[2*j-1];
2794       rmap_i    = rmap[is_no];
2795       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2796       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2797       imat_ilen = subc->ilen;
2798       imat_j    = subc->j;
2799       imat_i    = subc->i;
2800       imat_a    = subc->a;
2801       max1      = sbuf1_i[2*j];
2802       for (k=0; k<max1; k++,ct1++) {
2803         row = sbuf1_i[ct1];
2804 #if defined(PETSC_USE_CTABLE)
2805         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2806         row--;
2807 #else
2808         row = rmap_i[row];
2809 #endif
2810         ilen  = imat_ilen[row];
2811         mat_i = imat_i[row];
2812         mat_a = imat_a + mat_i;
2813         mat_j = imat_j + mat_i;
2814         max2  = rbuf2_i[ct1];
2815         if (!allcolumns[is_no]) {
2816           for (l=0; l<max2; l++,ct2++) {
2817 #if defined(PETSC_USE_CTABLE)
2818             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2819 #else
2820             tcol = cmap_i[rbuf3_i[ct2]];
2821 #endif
2822             if (tcol) {
2823               *mat_j++ = tcol - 1;
2824               *mat_a++ = rbuf4_i[ct2];
2825               ilen++;
2826             }
2827           }
2828         } else { /* allcolumns */
2829           for (l=0; l<max2; l++,ct2++) {
2830             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2831             *mat_a++ = rbuf4_i[ct2];
2832             ilen++;
2833           }
2834         }
2835         imat_ilen[row] = ilen;
2836       }
2837     }
2838   }
2839 
2840   if (!iscsorted) { /* sort column indices of the rows */
2841     for (i=0; i<ismax; i++) {
2842       subc      = (Mat_SeqAIJ*)submats[i]->data;
2843       imat_j    = subc->j;
2844       imat_i    = subc->i;
2845       imat_a    = subc->a;
2846       imat_ilen = subc->ilen;
2847 
2848       if (allcolumns[i]) continue;
2849       jmax = nrow[i];
2850       for (j=0; j<jmax; j++) {
2851         mat_i = imat_i[j];
2852         mat_a = imat_a + mat_i;
2853         mat_j = imat_j + mat_i;
2854         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2855       }
2856     }
2857   }
2858 
2859   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2860   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2861   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);}
2862   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2863   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2864 
2865   /* Restore the indices */
2866   for (i=0; i<ismax; i++) {
2867     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2868     if (!allcolumns[i]) {
2869       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2870     }
2871   }
2872 
2873   for (i=0; i<ismax; i++) {
2874     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2875     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2876   }
2877 
2878   /* Destroy allocated memory */
2879   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2880   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2881   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2882 
2883   for (i=0; i<nrqs; ++i) {
2884     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2885   }
2886   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2887 
2888   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2889   PetscFunctionReturn(0);
2890 }
2891 
2892 /*
2893  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2894  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2895  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2896  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2897  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2898  state, and needs to be "assembled" later by compressing B's column space.
2899 
2900  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2901  Following this call, C->A & C->B have been created, even if empty.
2902  */
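/*
 An illustrative (hypothetical) example of the embeddings: with rowemb = {0,2} and dcolemb = {1,3},
 entry A(0,0) is inserted at local position (0,1) of C's diagonal block and A(1,1) at (2,3); similarly,
 B(i,j) goes to local row rowemb[i] and column ocolemb[j] of the off-diagonal block, after which B's
 column indices at or beyond the start of the diagonal block are shifted by its width, yielding the
 global column indices of the "disassembled" state mentioned above.
 */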
2903 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2904 {
2905   /* If making this function public, change the error returned in this function away from _PLIB. */
2906   PetscErrorCode ierr;
2907   Mat_MPIAIJ     *aij;
2908   Mat_SeqAIJ     *Baij;
2909   PetscBool      seqaij,Bdisassembled;
2910   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2911   PetscScalar    v;
2912   const PetscInt *rowindices,*colindices;
2913 
2914   PetscFunctionBegin;
2915   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2916   if (A) {
2917     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2918     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2919     if (rowemb) {
2920       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2921       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2922     } else {
2923       if (C->rmap->n != A->rmap->n) {
2924         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2925       }
2926     }
2927     if (dcolemb) {
2928       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2929       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2930     } else {
2931       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2932     }
2933   }
2934   if (B) {
2935     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2936     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2937     if (rowemb) {
2938       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2939       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2940     } else {
2941       if (C->rmap->n != B->rmap->n) {
2942         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2943       }
2944     }
2945     if (ocolemb) {
2946       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2947       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2948     } else {
2949       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2950     }
2951   }
2952 
2953   aij = (Mat_MPIAIJ*)C->data;
2954   if (!aij->A) {
2955     /* Mimic parts of MatMPIAIJSetPreallocation() */
2956     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2957     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2958     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2959     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2960     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2961   }
2962   if (A) {
2963     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2964   } else {
2965     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2966   }
2967   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2968     /*
2969       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2970       need to "disassemble" B -- convert it to using C's global indices.
2971       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2972 
2973       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2974       we call MatZeroEntries() on aij->B first, so there may be a bunch of explicit zeros that could perhaps be compacted out.
2975 
2976       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2977       At least avoid calling MatSetValues() and the implied searches?
2978     */
2979 
2980     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2981 #if defined(PETSC_USE_CTABLE)
2982       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2983 #else
2984       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2985       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2986       if (aij->B) {
2987         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2988       }
2989 #endif
2990       ngcol = 0;
2991       if (aij->lvec) {
2992         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2993       }
2994       if (aij->garray) {
2995         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2996         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2997       }
2998       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2999       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3000     }
3001     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3002       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3003     }
3004     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3005       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3006     }
3007   }
3008   Bdisassembled = PETSC_FALSE;
3009   if (!aij->B) {
3010     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3011     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3012     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3013     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3014     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3015     Bdisassembled = PETSC_TRUE;
3016   }
3017   if (B) {
3018     Baij = (Mat_SeqAIJ*)B->data;
3019     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3020       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3021       for (i=0; i<B->rmap->n; i++) {
3022         nz[i] = Baij->i[i+1] - Baij->i[i];
3023       }
3024       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3025       ierr = PetscFree(nz);CHKERRQ(ierr);
3026     }
3027 
3028     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3029     shift = rend-rstart;
3030     count = 0;
3031     rowindices = NULL;
3032     colindices = NULL;
3033     if (rowemb) {
3034       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3035     }
3036     if (ocolemb) {
3037       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3038     }
3039     for (i=0; i<B->rmap->n; i++) {
3040       PetscInt row;
3041       row = i;
3042       if (rowindices) row = rowindices[i];
3043       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3044         col  = Baij->j[count];
3045         if (colindices) col = colindices[col];
3046         if (Bdisassembled && col>=rstart) col += shift;
3047         v    = Baij->a[count];
3048         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3049         ++count;
3050       }
3051     }
3052     /* No assembly for aij->B is necessary. */
3053     /* FIXME: set aij->B's nonzerostate correctly. */
3054   } else {
3055     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3056   }
3057   C->preallocated  = PETSC_TRUE;
3058   C->was_assembled = PETSC_FALSE;
3059   C->assembled     = PETSC_FALSE;
3060    /*
3061       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3062       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3063    */
3064   PetscFunctionReturn(0);
3065 }
3066 
3067 /*
3068   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3069  */
3070 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3071 {
3072   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3073 
3074   PetscFunctionBegin;
3075   PetscValidPointer(A,2);
3076   PetscValidPointer(B,3);
3077   /* FIXME: make sure C is assembled */
3078   *A = aij->A;
3079   *B = aij->B;
3080   /* Note that we don't incref *A and *B, so be careful! */
3081   PetscFunctionReturn(0);
3082 }
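
/*
  A minimal sketch (illustrative only; variable names are hypothetical) of interpreting the off-diagonal
  block's compact local column indices through garray, assuming C is an assembled MATMPIAIJ matrix:

     Mat        Ad,Ao;
     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
     Mat_SeqAIJ *bo;
     PetscInt   i,k,gcol;

     ierr = MatGetSeqMats_MPIAIJ(C,&Ad,&Ao);CHKERRQ(ierr);
     bo   = (Mat_SeqAIJ*)Ao->data;
     for (i=0; i<Ao->rmap->n; i++) {
       for (k=bo->i[i]; k<bo->i[i+1]; k++) {
         gcol = aij->garray[bo->j[k]];
       }
     }

  Here gcol is the global column index in C of the off-diagonal entry stored at position k of local row i.
*/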
3083 
3084 /*
3085   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3086   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3087 */
3088 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3089                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3090                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3091                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3092                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3093 {
3094   PetscErrorCode ierr;
3095   PetscMPIInt    size,flag;
3096   PetscInt       i,ii,cismax,ispar;
3097   Mat            *A,*B;
3098   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3099 
3100   PetscFunctionBegin;
3101   if (!ismax) PetscFunctionReturn(0);
3102 
3103   for (i = 0, cismax = 0; i < ismax; ++i) {
3104     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3105     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3106     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3107     if (size > 1) ++cismax;
3108   }
3109 
3110   /*
3111      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3112      ispar is the maximum of cismax over C's ranks, so it is nonzero iff some rank holds a parallel IS.
3113   */
3114   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3115   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3116     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3117     PetscFunctionReturn(0);
3118   }
3119 
3120   /* if (ispar) */
3121   /*
3122     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3123     These are used to extract the off-diag portion of the resulting parallel matrix.
3124     The row IS for the off-diag portion is the same as for the diag portion,
3125     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3126   */
3127   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3128   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3129   for (i = 0, ii = 0; i < ismax; ++i) {
3130     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3131     if (size > 1) {
3132       /*
3133          TODO: This is the part that's ***NOT SCALABLE***.
3134          To fix this we need to extract just the indices of C's nonzero columns
3135          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3136          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3137          to be done without serializing on the IS list, so, most likely, it is best
3138          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3139       */
3140       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3141       /* Now we have to
3142          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3143              were sorted on each rank, concatenated they might no longer be sorted;
3144          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3145              indices in the nondecreasing order to the original index positions.
3146          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3147       */
3148       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3149       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3150       ++ii;
3151     }
3152   }
3153   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3154   for (i = 0, ii = 0; i < ismax; ++i) {
3155     PetscInt       j,issize;
3156     const PetscInt *indices;
3157 
3158     /*
3159        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3160      */
3161     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3162     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3163     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3164     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3165     for (j = 1; j < issize; ++j) {
3166       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3167     }
3168     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3169     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3170     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3171     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3172     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3173     for (j = 1; j < issize; ++j) {
3174       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3175     }
3176     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3177     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3178     if (size > 1) {
3179       cisrow[ii] = isrow[i];
3180       ++ii;
3181     }
3182   }
3183   /*
3184     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3185     array of sequential matrices underlying the resulting parallel matrices.
3186     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3187     contain duplicates.
3188 
3189     There are as many diag matrices as there are original index sets. There are only as many parallel
3190     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3191 
3192     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3193     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3194       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3195       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3196       with A[i] and B[ii] extracted from the corresponding MPI submat.
3197     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3198       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3199       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3200       (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3201       values into A[i] and B[ii] sitting inside the corresponding submat.
3202     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3203       A[i], B[ii], AA[i] or BB[ii] matrices.
3204   */
3205   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3206   if (scall == MAT_INITIAL_MATRIX) {
3207     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3208   }
3209 
3210   /* Now obtain the sequential A and B submatrices separately. */
3211   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3212   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3213   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3214 
3215   /*
3216     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3217     matrices A & B have been extracted directly into the parallel matrices containing them, or
3218     simply into the sequential matrix identical with the corresponding A (if size == 1).
3219     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3220     to have the same sparsity pattern.
3221     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3222     must be constructed for C. This is done by setseqmat(s).
3223   */
3224   for (i = 0, ii = 0; i < ismax; ++i) {
3225     /*
3226        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3227        That way we can avoid sorting and computing permutations when reusing.
3228        To this end:
3229         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3230         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3231           be destroyed upon destruction of C (set a destroy routine via PetscContainerSetUserDestroy() to clear out the contents).
3232     */
3233     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3234 
3235     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3236     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3237     if (size > 1) {
3238       if (scall == MAT_INITIAL_MATRIX) {
3239         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3240         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3241         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3242         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3243         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3244       }
3245       /*
3246         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3247       */
3248       {
3249         Mat AA = A[i],BB = B[ii];
3250 
3251         if (AA || BB) {
3252           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3253           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3254           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3255         }
3256         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3257       }
3258       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3259       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3260       ++ii;
3261     } else { /* if (size == 1) */
3262       if (scall == MAT_REUSE_MATRIX) {
3263         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3264       }
3265       if (isrow_p[i] || iscol_p[i]) {
3266         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3267         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3268         /* Otherwise A is extracted straight into (*submat)[i]. */
3269         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3270         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3271       } else (*submat)[i] = A[i];
3272     }
3273     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3274     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3275   }
3276   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3277   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3278   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3279   ierr = PetscFree(A);CHKERRQ(ierr);
3280   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3281   PetscFunctionReturn(0);
3282 }
3283 
3284 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3285 {
3286   PetscErrorCode ierr;
3287 
3288   PetscFunctionBegin;
3289   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3290   PetscFunctionReturn(0);
3291 }
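
/*
  A minimal caller-side sketch (hypothetical index sets; nloc and first are placeholders) of the
  MatCreateSubMatricesMPI() interface that the routine above implements for MATMPIAIJ; the row/column
  index sets may live on subcommunicators of C's communicator:

     IS  isrow,iscol;
     Mat *submats;

     ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&iscol);CHKERRQ(ierr);
     ierr = MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
     ierr = ISDestroy(&isrow);CHKERRQ(ierr);
     ierr = ISDestroy(&iscol);CHKERRQ(ierr);
*/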
3292