xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision af4fa82cc29c77689f3cd2af837601dbdc3602c2)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
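/*
   Illustrative usage sketch (not part of this file): callers such as the ASM
   preconditioner enlarge a set of parallel index sets by ov levels with

      ierr = MatIncreaseOverlap(C,n,is,ov);CHKERRQ(ierr);

   which, for a MATMPIAIJ matrix C, dispatches to MatIncreaseOverlap_MPIAIJ()
   below; MatIncreaseOverlap_MPIAIJ_Scalable() implements the same operation
   with more scalable communication.
*/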
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode ierr;
52   MPI_Comm       comm;
53   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
54   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
56   const PetscInt *indices_i,**indices;
57   PetscLayout    rmap;
58   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
59   PetscSF        sf;
60   PetscSFNode    *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put it all together so that we
69    * can optimize communication
70    */
71   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++) {
82     length_i  = length[i];
83     indices_i = indices[i];
84     for (j=0; j<length_i; j++) {
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank) {
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
98   /* test whether we need to exchange messages;
99    * generally speaking, no data exchange is needed
100    * when the overlap is 1
101    */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* no rank has any messages to send;
104    * this usually corresponds to overlap 1
105    */
106   if (!reducednrrows) {
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communication */
114   for (i=0; i<size; i++) {
115     if (tosizes_temp[i]) {
116       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117       tosizes_temp[i] = nto; /* a map from processor to index */
118       toranks[nto++]  = i; /* processor */
119     }
120   }
121   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   toffsets[0] = 0;
123   for (i=0; i<nto; i++) {
124     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
125     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
126   }
127   /* send information to other processors */
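  /* PetscCommBuildTwoSided() scalably converts the send-side knowledge
     (nto,toranks,tosizes) into the matching receive-side information
     (nfrom,fromranks,fromsizes); each per-rank entry here is the pair
     <message length, offset into the sender's packed buffer> */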
128   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
129   nrecvrows = 0;
130   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
131   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
132   nrecvrows = 0;
133   for (i=0; i<nfrom; i++) {
134     for (j=0; j<fromsizes[2*i]; j++) {
135       remote[nrecvrows].rank    = fromranks[i];
136       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
137     }
138   }
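  /* build a star forest whose leaves are the nrecvrows integers this rank will
     receive; each remote[] entry points at (sending rank, offset into that rank's
     todata), so one PetscSFBcast moves todata on the senders into fromdata here */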
139   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
140   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
141   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
142   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
143   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
144   /* message pair <no of is, row>  */
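  /* illustrative example: if this rank requests rows 10 and 42 on behalf of is[0]
     and row 7 on behalf of is[2], all owned by one remote rank, that rank's block
     of todata reads {0,10, 0,42, 2,7} */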
145   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
146   for (i=0; i<nrrows; i++) {
147     owner = rrow_ranks[i]; /* processor */
148     j     = tosizes_temp[owner]; /* index */
149     todata[toffsets[j]++] = rrow_isids[i];
150     todata[toffsets[j]++] = remoterows[i];
151   }
152   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
153   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
154   ierr = PetscFree(toffsets);CHKERRQ(ierr);
155   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
156   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
157   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
158   /* send the rows that belong to remote ranks so that we can get the overlapping data back */
159   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
160   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
161   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
162   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
163   ierr = PetscFree(fromranks);CHKERRQ(ierr);
164   nrecvrows = 0;
165   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
166   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
167   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
168   nrecvrows = 0;
169   for (i=0; i<nto; i++) {
170     for (j=0; j<tosizes[2*i]; j++) {
171       remote[nrecvrows].rank    = toranks[i];
172       remote[nrecvrows++].index = tosizes[2*i+1]+j;
173     }
174   }
175   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
176   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
177   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
178   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
179   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
180   /* overlap communication and computation */
181   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
183   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
184   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
185   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
186   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
187   ierr = PetscFree(toranks);CHKERRQ(ierr);
188   ierr = PetscFree(tosizes);CHKERRQ(ierr);
189   ierr = PetscFree(todata);CHKERRQ(ierr);
190   PetscFunctionReturn(0);
191 }
192 
193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
194 {
195   PetscInt         *isz,isz_i,i,j,is_id, data_size;
196   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
197   const PetscInt   *indices_i_temp;
198   MPI_Comm         *iscomms;
199   PetscErrorCode    ierr;
200 
201   PetscFunctionBegin;
202   max_lsize = 0;
203   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
204   for (i=0; i<nidx; i++){
205     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
206     max_lsize = lsize>max_lsize ? lsize:max_lsize;
207     isz[i]    = lsize;
208   }
209   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
210   for (i=0; i<nidx; i++){
211     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
212     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
214     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
215     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
216   }
217   /* retrieve information to get row id and its overlap */
218   for (i=0; i<nrecvs;){
219     is_id     = recvdata[i++];
220     data_size = recvdata[i++];
221     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
222     isz_i     = isz[is_id];
223     for (j=0; j< data_size; j++){
224       col = recvdata[i++];
225       indices_i[isz_i++] = col;
226     }
227     isz[is_id] = isz_i;
228   }
229   /* remove duplicate entries */
230   for (i=0; i<nidx; i++){
231     indices_i = indices_temp+(max_lsize+nrecvs)*i;
232     isz_i     = isz[i];
233     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
234     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
235     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
236   }
237   ierr = PetscFree(isz);CHKERRQ(ierr);
238   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
243 {
244   PetscLayout       rmap,cmap;
245   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
246   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
247   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
248   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
249   Mat               amat,bmat;
250   PetscMPIInt       rank;
251   PetscBool         done;
252   MPI_Comm          comm;
253   PetscErrorCode    ierr;
254 
255   PetscFunctionBegin;
256   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
257   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
258   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
259   /* Even if the matrix is symmetric, we treat it as if it were not */
260   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
263   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
264   /* total number of nonzero values is used to estimate the memory usage in the next step */
265   tnz  = ai[an]+bi[bn];
266   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
267   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
268   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
269   /* find the longest message */
270   max_fszs = 0;
271   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
272   /* is there a better way to estimate the number of nonzeros in the mat? */
273   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
274   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
275   rows_pos  = 0;
276   totalrows = 0;
277   for (i=0; i<nfrom; i++){
278     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
279     /* group data together */
280     for (j=0; j<fromsizes[2*i]; j+=2){
281       is_id                       = fromrows[rows_pos++];/* no of is */
282       rows_i                      = rows_data[is_id];
283       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
284     }
285     /* estimate the space needed so as to avoid multiple allocations */
286     for (j=0; j<nidx; j++){
287       indvc_ij = 0;
288       rows_i   = rows_data[j];
289       for (l=0; l<rows_pos_i[j]; l++){
290         row    = rows_i[l]-rstart;
291         start  = ai[row];
292         end    = ai[row+1];
293         for (k=start; k<end; k++){ /* Amat */
294           col = aj[k] + cstart;
295           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
296         }
297         start = bi[row];
298         end   = bi[row+1];
299         for (k=start; k<end; k++) { /* Bmat */
300           col = gcols[bj[k]];
301           indices_tmp[indvc_ij++] = col;
302         }
303       }
304       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
305       indv_counts[i*nidx+j] = indvc_ij;
306       totalrows            += indvc_ij;
307     }
308   }
309   /* message triple <no of is, number of rows, rows> */
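  /* illustrative example: replying for is 0 with the 3 overlap columns {5,9,11} and
     for is 2 with the 2 columns {4,8}, the corresponding block of sbdata reads
     {0,3,5,9,11, 2,2,4,8} */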
310   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
311   totalrows = 0;
312   rows_pos  = 0;
313   /* repeat the grouping pass above, this time packing the data */
314   for (i=0;i<nfrom;i++){
315     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
316     for (j=0; j<fromsizes[2*i]; j+=2){
317       is_id                       = fromrows[rows_pos++];
318       rows_i                      = rows_data[is_id];
319       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
320     }
321     /* add data  */
322     for (j=0; j<nidx; j++){
323       if (!indv_counts[i*nidx+j]) continue;
324       indvc_ij = 0;
325       sbdata[totalrows++] = j;
326       sbdata[totalrows++] = indv_counts[i*nidx+j];
327       sbsizes[2*i]       += 2;
328       rows_i              = rows_data[j];
329       for (l=0; l<rows_pos_i[j]; l++){
330         row   = rows_i[l]-rstart;
331         start = ai[row];
332         end   = ai[row+1];
333         for (k=start; k<end; k++){ /* Amat */
334           col = aj[k] + cstart;
335           indices_tmp[indvc_ij++] = col;
336         }
337         start = bi[row];
338         end   = bi[row+1];
339         for (k=start; k<end; k++) { /* Bmat */
340           col = gcols[bj[k]];
341           indices_tmp[indvc_ij++] = col;
342         }
343       }
344       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
345       sbsizes[2*i]  += indvc_ij;
346       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
347       totalrows += indvc_ij;
348     }
349   }
350   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
351   offsets[0] = 0;
352   for (i=0; i<nfrom; i++){
353     offsets[i+1]   = offsets[i] + sbsizes[2*i];
354     sbsizes[2*i+1] = offsets[i];
355   }
356   ierr = PetscFree(offsets);CHKERRQ(ierr);
357   if (sbrowsizes) *sbrowsizes = sbsizes;
358   if (sbrows) *sbrows = sbdata;
359   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
360   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
361   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
366 {
367   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
368   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
369   PetscInt          lsize,lsize_tmp;
370   PetscMPIInt       rank,owner;
371   Mat               amat,bmat;
372   PetscBool         done;
373   PetscLayout       cmap,rmap;
374   MPI_Comm          comm;
375   PetscErrorCode    ierr;
376 
377   PetscFunctionBegin;
378   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
379   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
380   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
381   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
384   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
385   /* is this a safe way to compute the number of nonzero values? */
386   tnz  = ai[an]+bi[bn];
387   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
388   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
389   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
390   /* this estimates memory better than the old implementation,
391    * which used the global size of the matrix
392    */
393   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
394   for (i=0; i<nidx; i++) {
395     MPI_Comm iscomm;
396 
397     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
398     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
399     lsize_tmp = 0;
400     for (j=0; j<lsize; j++) {
401       owner = -1;
402       row   = indices[j];
403       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
404       if (owner != rank) continue;
405       /* local number */
406       row  -= rstart;
407       start = ai[row];
408       end   = ai[row+1];
409       for (k=start; k<end; k++) { /* Amat */
410         col = aj[k] + cstart;
411         indices_temp[lsize_tmp++] = col;
412       }
413       start = bi[row];
414       end   = bi[row+1];
415       for (k=start; k<end; k++) { /* Bmat */
416         col = gcols[bj[k]];
417         indices_temp[lsize_tmp++] = col;
418       }
419     }
420     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
421     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
422     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
423     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
424     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
425     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
426   }
427   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
428   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
429   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 
434 /*
435   Sample message format:
436   If processor A wants processor B to process some elements corresponding
437   to index sets is[1] and is[5], then the message reads
438   mesg [0] = 2   (no of index sets in the mesg)
439   -----------
440   mesg [1] = 1 => is[1]
441   mesg [2] = sizeof(is[1]);
442   -----------
443   mesg [3] = 5  => is[5]
444   mesg [4] = sizeof(is[5]);
445   -----------
446   mesg [5]
447   mesg [n]   data(is[1])
448   -----------
449   mesg [n+1]
450   mesg [m]   data(is[5])
451   -----------
452 
453   Notes:
454   nrqs - no of requests sent (or to be sent out)
455   nrqr - no of requests received (which have to be or which have been processed)
456 */
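/*
  Concrete illustration (numbers chosen for exposition only): if A asks B to handle
  rows {30,31} of is[1] and row {72} of is[5], the message A sends to B is

     { 2,  1,2,  5,1,  30,31,  72 }

  i.e. the header {2, 1,2, 5,1} followed by the row data for is[1] and then for is[5].
*/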
457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458 {
459   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
460   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461   const PetscInt **idx,*idx_i;
462   PetscInt       *n,**data,len;
463 #if defined(PETSC_USE_CTABLE)
464   PetscTable     *table_data,table_data_i;
465   PetscInt       *tdata,tcount,tcount_max;
466 #else
467   PetscInt       *data_i,*d_p;
468 #endif
469   PetscErrorCode ierr;
470   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
471   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473   PetscBT        *table;
474   MPI_Comm       comm;
475   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476   MPI_Status     *s_status,*recv_status;
477   MPI_Comm       *iscomms;
478   char           *t_p;
479 
480   PetscFunctionBegin;
481   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
482   size = c->size;
483   rank = c->rank;
484   M    = C->rmap->N;
485 
486   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
487   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
488 
489   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
490 
491   for (i=0; i<imax; i++) {
492     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
493     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
494   }
495 
496   /* evaluate communication - mesg to who, length of mesg, and buffer space
497      required. Based on this, buffers are allocated, and data copied into them  */
498   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
499   for (i=0; i<imax; i++) {
500     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
501     idx_i = idx[i];
502     len   = n[i];
503     for (j=0; j<len; j++) {
504       row = idx_i[j];
505       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
507       w4[proc]++;
508     }
509     for (j=0; j<size; j++) {
510       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511     }
512   }
513 
514   nrqs     = 0;              /* no of outgoing messages */
515   msz      = 0;              /* total mesg length (for all procs) */
516   w1[rank] = 0;              /* no mesg sent to itself */
517   w3[rank] = 0;
518   for (i=0; i<size; i++) {
519     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520   }
521   /* pa - is list of processors to communicate with */
522   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
523   for (i=0,j=0; i<size; i++) {
524     if (w1[i]) {pa[j] = i; j++;}
525   }
526 
527   /* Each message would have a header = 1 + 2*(no of IS) + data */
528   for (i=0; i<nrqs; i++) {
529     j      = pa[i];
530     w1[j] += w2[j] + 2*w3[j];
531     msz   += w1[j];
532   }
533 
534   /* Determine the number of messages to expect, their lengths, and the from-ids */
535   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
536   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
537 
538   /* Now post the Irecvs corresponding to these messages */
539   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
540 
541   /* Allocate Memory for outgoing messages */
542   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
543   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
544   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
545 
546   {
547     PetscInt *iptr = tmp,ict  = 0;
548     for (i=0; i<nrqs; i++) {
549       j         = pa[i];
550       iptr     +=  ict;
551       outdat[j] = iptr;
552       ict       = w1[j];
553     }
554   }
555 
556   /* Form the outgoing messages */
557   /* plug in the headers */
558   for (i=0; i<nrqs; i++) {
559     j            = pa[i];
560     outdat[j][0] = 0;
561     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
562     ptr[j]       = outdat[j] + 2*w3[j] + 1;
563   }
564 
565   /* Memory for doing local proc's work */
566   {
567     PetscInt M_BPB_imax = 0;
568 #if defined(PETSC_USE_CTABLE)
569     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
570     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
571     for (i=0; i<imax; i++) {
572       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
573     }
574     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
575     for (i=0; i<imax; i++) {
576       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577     }
578 #else
579     PetscInt Mimax = 0;
580     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
581     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
582     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
583     for (i=0; i<imax; i++) {
584       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585       data[i]  = d_p + M*i;
586     }
587 #endif
588   }
589 
590   /* Parse the IS and update local tables and the outgoing buf with the data */
591   {
592     PetscInt n_i,isz_i,*outdat_j,ctr_j;
593     PetscBT  table_i;
594 
595     for (i=0; i<imax; i++) {
596       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
597       n_i     = n[i];
598       table_i = table[i];
599       idx_i   = idx[i];
600 #if defined(PETSC_USE_CTABLE)
601       table_data_i = table_data[i];
602 #else
603       data_i  = data[i];
604 #endif
605       isz_i   = isz[i];
606       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
607         row  = idx_i[j];
608         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
609         if (proc != rank) { /* copy to the outgoing buffer */
610           ctr[proc]++;
611           *ptr[proc] = row;
612           ptr[proc]++;
613         } else if (!PetscBTLookupSet(table_i,row)) {
614 #if defined(PETSC_USE_CTABLE)
615           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
616 #else
617           data_i[isz_i] = row; /* Update the local table */
618 #endif
619           isz_i++;
620         }
621       }
622       /* Update the headers for the current IS */
623       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624         if ((ctr_j = ctr[j])) {
625           outdat_j        = outdat[j];
626           k               = ++outdat_j[0];
627           outdat_j[2*k]   = ctr_j;
628           outdat_j[2*k-1] = i;
629         }
630       }
631       isz[i] = isz_i;
632     }
633   }
634 
635   /*  Now  post the sends */
636   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
637   for (i=0; i<nrqs; ++i) {
638     j    = pa[i];
639     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
640   }
641 
642   /* No longer need the original indices */
643   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
644   for (i=0; i<imax; ++i) {
645     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
646     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
647   }
648   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
649 
650   for (i=0; i<imax; ++i) {
651     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
652   }
653 
654   /* Do Local work */
655 #if defined(PETSC_USE_CTABLE)
656   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
657 #else
658   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
659 #endif
660 
661   /* Receive messages */
662   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
663   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);}
664 
665   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
666   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);}
667 
668   /* Phase 1 sends are complete - deallocate buffers */
669   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
670   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
671 
672   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
673   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
674   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
675   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
676   ierr = PetscFree(rbuf);CHKERRQ(ierr);
677 
678 
679   /* Send the data back */
680   /* Do a global reduction to know the buffer space req for incoming messages */
681   {
682     PetscMPIInt *rw1;
683 
684     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
685 
686     for (i=0; i<nrqr; ++i) {
687       proc = recv_status[i].MPI_SOURCE;
688 
689       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690       rw1[proc] = isz1[i];
691     }
692     ierr = PetscFree(onodes1);CHKERRQ(ierr);
693     ierr = PetscFree(olengths1);CHKERRQ(ierr);
694 
695     /* Determine the number of messages to expect, their lengths, and the from-ids */
696     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
697     ierr = PetscFree(rw1);CHKERRQ(ierr);
698   }
699   /* Now post the Irecvs corresponding to these messages */
700   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
701 
702   /* Now  post the sends */
703   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
704   for (i=0; i<nrqr; ++i) {
705     j    = recv_status[i].MPI_SOURCE;
706     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
707   }
708 
709   /* receive work done on other processors */
710   {
711     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712     PetscMPIInt idex;
713     PetscBT     table_i;
714     MPI_Status  *status2;
715 
716     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
717     for (i=0; i<nrqs; ++i) {
718       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr);
719       /* Process the message */
720       rbuf2_i = rbuf2[idex];
721       ct1     = 2*rbuf2_i[0]+1;
722       jmax    = rbuf2[idex][0];
723       for (j=1; j<=jmax; j++) {
724         max     = rbuf2_i[2*j];
725         is_no   = rbuf2_i[2*j-1];
726         isz_i   = isz[is_no];
727         table_i = table[is_no];
728 #if defined(PETSC_USE_CTABLE)
729         table_data_i = table_data[is_no];
730 #else
731         data_i  = data[is_no];
732 #endif
733         for (k=0; k<max; k++,ct1++) {
734           row = rbuf2_i[ct1];
735           if (!PetscBTLookupSet(table_i,row)) {
736 #if defined(PETSC_USE_CTABLE)
737             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
738 #else
739             data_i[isz_i] = row;
740 #endif
741             isz_i++;
742           }
743         }
744         isz[is_no] = isz_i;
745       }
746     }
747 
748     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);}
749     ierr = PetscFree(status2);CHKERRQ(ierr);
750   }
751 
752 #if defined(PETSC_USE_CTABLE)
753   tcount_max = 0;
754   for (i=0; i<imax; ++i) {
755     table_data_i = table_data[i];
756     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
757     if (tcount_max < tcount) tcount_max = tcount;
758   }
759   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
760 #endif
761 
762   for (i=0; i<imax; ++i) {
763 #if defined(PETSC_USE_CTABLE)
764     PetscTablePosition tpos;
765     table_data_i = table_data[i];
766 
767     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
768     while (tpos) {
769       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
770       tdata[--j] = --k;
771     }
772     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
773 #else
774     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
775 #endif
776     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
777   }
778 
779   ierr = PetscFree(iscomms);CHKERRQ(ierr);
780   ierr = PetscFree(onodes2);CHKERRQ(ierr);
781   ierr = PetscFree(olengths2);CHKERRQ(ierr);
782 
783   ierr = PetscFree(pa);CHKERRQ(ierr);
784   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
785   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
786   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
787   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
788   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
789   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
790   ierr = PetscFree(s_status);CHKERRQ(ierr);
791   ierr = PetscFree(recv_status);CHKERRQ(ierr);
792   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
793   ierr = PetscFree(xdata);CHKERRQ(ierr);
794   ierr = PetscFree(isz1);CHKERRQ(ierr);
795 #if defined(PETSC_USE_CTABLE)
796   for (i=0; i<imax; i++) {
797     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
798   }
799   ierr = PetscFree(table_data);CHKERRQ(ierr);
800   ierr = PetscFree(tdata);CHKERRQ(ierr);
801   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
802 #else
803   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
804 #endif
805   PetscFunctionReturn(0);
806 }
807 
808 /*
809    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
810        the work on the local processor.
811 
812      Inputs:
813       C      - the MATMPIAIJ matrix;
814       imax   - total no of index sets processed at a time;
815       table  - an array of PetscBT bit arrays, each of size M bits.
816 
817      Output:
818       isz    - array containing the count of the solution elements corresponding
819                to each index set;
820       data or table_data  - pointer to the solutions
821 */
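/*
  Sketch of the per-row update done below (notation as in the code): for a global row r
  already in the set, with local index row = r - rstart, the candidate new entries are
  the diagonal-block columns aj[ai[row]..ai[row+1]) + cstart and the off-diagonal-block
  columns garray[bj[bi[row]..bi[row+1])], each added only if not already marked in the
  PetscBT table.
*/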
822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823 {
824   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825   Mat        A  = c->A,B = c->B;
826   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
828   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
829   PetscBT    table_i;
830 #if defined(PETSC_USE_CTABLE)
831   PetscTable         table_data_i;
832   PetscErrorCode     ierr;
833   PetscTablePosition tpos;
834   PetscInt           tcount,*tdata;
835 #else
836   PetscInt           *data_i;
837 #endif
838 
839   PetscFunctionBegin;
840   rstart = C->rmap->rstart;
841   cstart = C->cmap->rstart;
842   ai     = a->i;
843   aj     = a->j;
844   bi     = b->i;
845   bj     = b->j;
846   garray = c->garray;
847 
848   for (i=0; i<imax; i++) {
849 #if defined(PETSC_USE_CTABLE)
850     /* copy existing entries of table_data_i into tdata[] */
851     table_data_i = table_data[i];
852     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
853     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
854 
855     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
856     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
857     while (tpos) {
858       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
859       tdata[--j] = --row;
860       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
861     }
862 #else
863     data_i  = data[i];
864 #endif
865     table_i = table[i];
866     isz_i   = isz[i];
867     max     = isz[i];
868 
869     for (j=0; j<max; j++) {
870 #if defined(PETSC_USE_CTABLE)
871       row   = tdata[j] - rstart;
872 #else
873       row   = data_i[j] - rstart;
874 #endif
875       start = ai[row];
876       end   = ai[row+1];
877       for (k=start; k<end; k++) { /* Amat */
878         val = aj[k] + cstart;
879         if (!PetscBTLookupSet(table_i,val)) {
880 #if defined(PETSC_USE_CTABLE)
881           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
882 #else
883           data_i[isz_i] = val;
884 #endif
885           isz_i++;
886         }
887       }
888       start = bi[row];
889       end   = bi[row+1];
890       for (k=start; k<end; k++) { /* Bmat */
891         val = garray[bj[k]];
892         if (!PetscBTLookupSet(table_i,val)) {
893 #if defined(PETSC_USE_CTABLE)
894           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
895 #else
896           data_i[isz_i] = val;
897 #endif
898           isz_i++;
899         }
900       }
901     }
902     isz[i] = isz_i;
903 
904 #if defined(PETSC_USE_CTABLE)
905     ierr = PetscFree(tdata);CHKERRQ(ierr);
906 #endif
907   }
908   PetscFunctionReturn(0);
909 }
910 
911 /*
912       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913          and return the output
914 
915          Input:
916            C    - the matrix
917            nrqr - no of messages being processed.
918            rbuf - an array of pointers to the received requests
919 
920          Output:
921            xdata - array of messages to be sent back
922            isz1  - size of each message
923 
924   For better efficiency perhaps we should malloc each xdata[i] separately;
925 then if a realloc is required we need only copy the data for that one row
926 rather than all previous rows, as happens now where a single large chunk of
927 memory is used.
928 
929 */
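/*
  Reply layout (mirroring the request): xdata[i][0] is the number of index sets,
  followed by <is_no, count> pairs, then per index set the requested rows plus every
  column reachable from them (one level of overlap), with duplicates filtered through
  the PetscBT xtable; isz1[i] records the total length of message i.
*/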
930 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
931 {
932   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
933   Mat            A  = c->A,B = c->B;
934   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
935   PetscErrorCode ierr;
936   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
937   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
938   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
939   PetscInt       *rbuf_i,kmax,rbuf_0;
940   PetscBT        xtable;
941 
942   PetscFunctionBegin;
943   m      = C->rmap->N;
944   rstart = C->rmap->rstart;
945   cstart = C->cmap->rstart;
946   ai     = a->i;
947   aj     = a->j;
948   bi     = b->i;
949   bj     = b->j;
950   garray = c->garray;
951 
952 
953   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
954     rbuf_i =  rbuf[i];
955     rbuf_0 =  rbuf_i[0];
956     ct    += rbuf_0;
957     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
958   }
959 
960   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
961   else max1 = 1;
962   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
963   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
964   ++no_malloc;
965   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
966   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
967 
968   ct3 = 0;
969   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
970     rbuf_i =  rbuf[i];
971     rbuf_0 =  rbuf_i[0];
972     ct1    =  2*rbuf_0+1;
973     ct2    =  ct1;
974     ct3   += ct1;
975     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
976       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
977       oct2 = ct2;
978       kmax = rbuf_i[2*j];
979       for (k=0; k<kmax; k++,ct1++) {
980         row = rbuf_i[ct1];
981         if (!PetscBTLookupSet(xtable,row)) {
982           if (!(ct3 < mem_estimate)) {
983             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
984             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
985             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
986             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
987             xdata[0]     = tmp;
988             mem_estimate = new_estimate; ++no_malloc;
989             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
990           }
991           xdata[i][ct2++] = row;
992           ct3++;
993         }
994       }
995       for (k=oct2,max2=ct2; k<max2; k++) {
996         row   = xdata[i][k] - rstart;
997         start = ai[row];
998         end   = ai[row+1];
999         for (l=start; l<end; l++) {
1000           val = aj[l] + cstart;
1001           if (!PetscBTLookupSet(xtable,val)) {
1002             if (!(ct3 < mem_estimate)) {
1003               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1004               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1005               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1006               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1007               xdata[0]     = tmp;
1008               mem_estimate = new_estimate; ++no_malloc;
1009               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1010             }
1011             xdata[i][ct2++] = val;
1012             ct3++;
1013           }
1014         }
1015         start = bi[row];
1016         end   = bi[row+1];
1017         for (l=start; l<end; l++) {
1018           val = garray[bj[l]];
1019           if (!PetscBTLookupSet(xtable,val)) {
1020             if (!(ct3 < mem_estimate)) {
1021               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1022               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1023               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1024               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1025               xdata[0]     = tmp;
1026               mem_estimate = new_estimate; ++no_malloc;
1027               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1028             }
1029             xdata[i][ct2++] = val;
1030             ct3++;
1031           }
1032         }
1033       }
1034       /* Update the header*/
1035       xdata[i][2*j]   = ct2 - oct2; /* no of entries added for this IS (could drop the isz1 vector and use a scalar) */
1036       xdata[i][2*j-1] = rbuf_i[2*j-1];
1037     }
1038     xdata[i][0] = rbuf_0;
1039     xdata[i+1]  = xdata[i] + ct2;
1040     isz1[i]     = ct2; /* size of each message */
1041   }
1042   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1043   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1044   PetscFunctionReturn(0);
1045 }
1046 /* -------------------------------------------------------------------------*/
1047 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048 /*
1049     Every processor gets the entire matrix
1050 */
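/*
  Outline of the implementation below: every rank MPI_Allgatherv's the per-row nonzero
  counts, then the global column indices, and, when flag == MAT_GET_VALUES, the
  numerical values as well, assembling the result into a sequential AIJ matrix held
  redundantly on each rank.
*/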
1051 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1052 {
1053   Mat            B;
1054   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1055   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1056   PetscErrorCode ierr;
1057   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1058   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1059   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1060 
1061   PetscFunctionBegin;
1062   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1063   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1064   if (scall == MAT_INITIAL_MATRIX) {
1065     /* ----------------------------------------------------------------
1066          Tell every processor the number of nonzeros per row
1067     */
1068     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1069     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1070       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1071     }
1072     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1073     for (i=0; i<size; i++) {
1074       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1075       displs[i]     = A->rmap->range[i];
1076     }
1077 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1078     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1079 #else
1080     sendcount = A->rmap->rend - A->rmap->rstart;
1081     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1082 #endif
1083     /* ---------------------------------------------------------------
1084          Create the sequential matrix of the same type as the local block diagonal
1085     */
1086     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1087     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1088     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1089     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1090     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1091     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1092     **Bin = B;
1093     b     = (Mat_SeqAIJ*)B->data;
1094 
1095     /*--------------------------------------------------------------------
1096        Copy my part of matrix column indices over
1097     */
1098     sendcount  = ad->nz + bd->nz;
1099     jsendbuf   = b->j + b->i[rstarts[rank]];
1100     a_jsendbuf = ad->j;
1101     b_jsendbuf = bd->j;
1102     n          = A->rmap->rend - A->rmap->rstart;
1103     cnt        = 0;
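    /* merge so that each row's global column indices come out sorted: first the
       off-diagonal (B) columns lying below the diagonal block, then the diagonal-block
       (A) columns, then the remaining B columns */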
1104     for (i=0; i<n; i++) {
1105       /* put in lower diagonal portion */
1106       m = bd->i[i+1] - bd->i[i];
1107       while (m > 0) {
1108         /* is it above diagonal (in bd (compressed) numbering) */
1109         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1110         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1111         m--;
1112       }
1113 
1114       /* put in diagonal portion */
1115       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1116         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1117       }
1118 
1119       /* put in upper diagonal portion */
1120       while (m-- > 0) {
1121         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1122       }
1123     }
1124     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1125 
1126     /*--------------------------------------------------------------------
1127        Gather all column indices to all processors
1128     */
1129     for (i=0; i<size; i++) {
1130       recvcounts[i] = 0;
1131       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1132         recvcounts[i] += lens[j];
1133       }
1134     }
1135     displs[0] = 0;
1136     for (i=1; i<size; i++) {
1137       displs[i] = displs[i-1] + recvcounts[i-1];
1138     }
1139 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1140     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1141 #else
1142     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1143 #endif
1144     /*--------------------------------------------------------------------
1145         Assemble the matrix into usable form (note numerical values not yet set)
1146     */
1147     /* set the b->ilen (length of each row) values */
1148     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1149     /* set the b->i indices */
1150     b->i[0] = 0;
1151     for (i=1; i<=A->rmap->N; i++) {
1152       b->i[i] = b->i[i-1] + lens[i-1];
1153     }
1154     ierr = PetscFree(lens);CHKERRQ(ierr);
1155     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1156     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1157 
1158   } else {
1159     B = **Bin;
1160     b = (Mat_SeqAIJ*)B->data;
1161   }
1162 
1163   /*--------------------------------------------------------------------
1164        Copy my part of matrix numerical values into the values location
1165   */
1166   if (flag == MAT_GET_VALUES) {
1167     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1168     MatScalar         *sendbuf,*recvbuf;
1169 
1170     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1171     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1172     sendcount = ad->nz + bd->nz;
1173     sendbuf   = b->a + b->i[rstarts[rank]];
1174     a_sendbuf = ada;
1175     b_sendbuf = bda;
1176     b_sendj   = bd->j;
1177     n         = A->rmap->rend - A->rmap->rstart;
1178     cnt       = 0;
1179     for (i=0; i<n; i++) {
1180       /* put in lower diagonal portion */
1181       m = bd->i[i+1] - bd->i[i];
1182       while (m > 0) {
1183         /* is it above diagonal (in bd (compressed) numbering) */
1184         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185         sendbuf[cnt++] = *b_sendbuf++;
1186         m--;
1187         b_sendj++;
1188       }
1189 
1190       /* put in diagonal portion */
1191       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192         sendbuf[cnt++] = *a_sendbuf++;
1193       }
1194 
1195       /* put in upper diagonal portion */
1196       while (m-- > 0) {
1197         sendbuf[cnt++] = *b_sendbuf++;
1198         b_sendj++;
1199       }
1200     }
1201     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1202 
1203     /* -----------------------------------------------------------------
1204        Gather all numerical values to all processors
1205     */
1206     if (!recvcounts) {
1207       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1208     }
1209     for (i=0; i<size; i++) {
1210       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211     }
1212     displs[0] = 0;
1213     for (i=1; i<size; i++) {
1214       displs[i] = displs[i-1] + recvcounts[i-1];
1215     }
1216     recvbuf = b->a;
1217 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1219 #else
1220     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1221 #endif
1222     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1223     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1224   }  /* end of (flag == MAT_GET_VALUES) */
1225   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1226 
1227   if (A->symmetric) {
1228     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1229   } else if (A->hermitian) {
1230     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1231   } else if (A->structurally_symmetric) {
1232     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1233   }
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1238 {
1239   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1240   Mat            submat,A = c->A,B = c->B;
1241   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1242   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1243   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1244   const PetscInt *icol,*irow;
1245   PetscInt       nrow,ncol,start;
1246   PetscErrorCode ierr;
1247   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1248   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1249   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1250   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1251   PetscInt       *lens,rmax,ncols,*cols,Crow;
1252 #if defined(PETSC_USE_CTABLE)
1253   PetscTable     cmap,rmap;
1254   PetscInt       *cmap_loc,*rmap_loc;
1255 #else
1256   PetscInt       *cmap,*rmap;
1257 #endif
1258   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1259   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1260   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1261   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1262   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1263   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1264   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1265   MPI_Comm       comm;
1266   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1267   PetscMPIInt    *onodes1,*olengths1,idex,end;
1268   Mat_SubSppt    *smatis1;
1269   PetscBool      isrowsorted,iscolsorted;
1270 
1271   PetscFunctionBegin;
1272   PetscValidLogicalCollectiveInt(C,ismax,2);
1273   PetscValidLogicalCollectiveEnum(C,scall,5);
1274   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1275   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1276   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1277   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1278   size = c->size;
1279   rank = c->rank;
1280 
1281   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1282   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1283   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1284   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1285   if (allcolumns) {
1286     icol = NULL;
1287     ncol = C->cmap->N;
1288   } else {
1289     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1290     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1291   }
1292 
1293   if (scall == MAT_INITIAL_MATRIX) {
1294     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1295 
1296     /* Get some new tags to keep the communication clean */
1297     tag1 = ((PetscObject)C)->tag;
1298     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1299     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1300 
1301     /* evaluate communication - mesg to who, length of mesg, and buffer space
1302      required. Based on this, buffers are allocated, and data copied into them */
1303     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1304     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1305 
1306     /* w1[proc] = num of rows owned by proc -- to be requested */
1307     proc = 0;
1308     nrqs = 0; /* num of outgoing messages */
1309     for (j=0; j<nrow; j++) {
1310       row  = irow[j];
1311       if (!isrowsorted) proc = 0;
1312       while (row >= C->rmap->range[proc+1]) proc++;
1313       w1[proc]++;
1314       row2proc[j] = proc; /* map row index to proc */
1315 
1316       if (proc != rank && !w2[proc]) {
1317         w2[proc] = 1; nrqs++;
1318       }
1319     }
1320     w1[rank] = 0;  /* rows owned by self will not be requested */
1321 
1322     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1323     for (proc=0,j=0; proc<size; proc++) {
1324       if (w1[proc]) { pa[j++] = proc;}
1325     }
1326 
1327     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1328     msz = 0;              /* total mesg length (for all procs) */
1329     for (i=0; i<nrqs; i++) {
1330       proc      = pa[i];
1331       w1[proc] += 3;
1332       msz      += w1[proc];
1333     }
1334     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1335 
1336     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1337     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1338     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1339 
1340     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1341        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1342     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1343 
1344     /* Now post the Irecvs corresponding to these messages */
1345     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1346 
1347     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1348     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1349 
1350     /* Allocate Memory for outgoing messages */
1351     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1352     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1353     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1354 
1355     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1356     iptr = tmp;
1357     for (i=0; i<nrqs; i++) {
1358       proc        = pa[i];
1359       sbuf1[proc] = iptr;
1360       iptr       += w1[proc];
1361     }
1362 
1363     /* Form the outgoing messages */
1364     /* Initialize the header space */
1365     for (i=0; i<nrqs; i++) {
1366       proc      = pa[i];
1367       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1368       ptr[proc] = sbuf1[proc] + 3;
1369     }
1370 
1371     /* Parse the isrow and copy data into outbuf */
1372     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1373     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1374       proc = row2proc[j];
1375       if (proc != rank) { /* copy to the outgoing buf*/
1376         *ptr[proc] = irow[j];
1377         ctr[proc]++; ptr[proc]++;
1378       }
1379     }
1380 
1381     /* Update the headers for the current IS */
1382     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1383       if ((ctr_j = ctr[j])) {
1384         sbuf1_j        = sbuf1[j];
1385         k              = ++sbuf1_j[0];
1386         sbuf1_j[2*k]   = ctr_j;
1387         sbuf1_j[2*k-1] = 0;
1388       }
1389     }
1390 
1391     /* Now post the sends */
1392     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1393     for (i=0; i<nrqs; ++i) {
1394       proc = pa[i];
1395       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1396     }
1397 
1398     /* Post Receives to capture the buffer size */
1399     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1400     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1401 
1402     rbuf2[0] = tmp + msz;
1403     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
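    /* rbuf2[i] reuses the second half of tmp (allocated with length 2*msz above): the reply from proc pa[i]
       echoes the request header and replaces each requested row by its nonzero count, so it has the same
       length w1[pa[i]] as the request and needs no separate allocation */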
1404 
1405     for (i=0; i<nrqs; ++i) {
1406       proc = pa[i];
1407       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1408     }
1409 
1410     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1411 
1412     /* Send to other procs the buf size they should allocate */
1413     /* Receive messages*/
1414     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1415     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1416 
1417     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1418     for (i=0; i<nrqr; ++i) {
1419       req_size[i] = 0;
1420       rbuf1_i        = rbuf1[i];
1421       start          = 2*rbuf1_i[0] + 1;
1422       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1423       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1424       sbuf2_i        = sbuf2[i];
1425       for (j=start; j<end; j++) {
1426         k            = rbuf1_i[j] - rstart;
1427         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1428         sbuf2_i[j]   = ncols;
1429         req_size[i] += ncols;
1430       }
1431       req_source1[i] = r_status1[i].MPI_SOURCE;
1432 
1433       /* form the header */
1434       sbuf2_i[0] = req_size[i];
1435       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1436 
1437       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1438     }
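    /* Reply layout mirrors the request: sbuf2_i[0] = total number of nonzeros that will be sent back
       (req_size[i]), sbuf2_i[1..start-1] echo the request header, and sbuf2_i[j] for each requested row
       position j holds the nonzero count of that row (diagonal A plus off-diagonal B part) */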
1439 
1440     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1441     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1442 
1443     /* rbuf2 is received; post receives for column indices a->j */
1444     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1445 
1446     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1447     for (i=0; i<nrqs; ++i) {
1448       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1449       req_source2[i] = r_status2[i].MPI_SOURCE;
1450       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1451     }
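    /* rbuf2[i][0] is the total number of column indices the owner will send, so rbuf3[i] can be sized exactly */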
1452 
1453     /* Wait on sends1 and sends2 */
1454     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1455     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1456     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1457     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1458 
1459     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1460     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1461 
1462     /* Now allocate sending buffers for a->j, and send them off */
1463     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1464     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1465     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1466     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
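    /* sbuf_aj[0] is one contiguous block of sum(req_size[i]) indices; sbuf_aj[i] are running offsets into it,
       one slot per requesting process */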
1467 
1468     for (i=0; i<nrqr; i++) { /* for each requested message */
1469       rbuf1_i   = rbuf1[i];
1470       sbuf_aj_i = sbuf_aj[i];
1471       ct1       = 2*rbuf1_i[0] + 1;
1472       ct2       = 0;
1473       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1474 
1475       kmax = rbuf1[i][2];
1476       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1477         row    = rbuf1_i[ct1] - rstart;
1478         nzA    = ai[row+1] - ai[row];
1479         nzB    = bi[row+1] - bi[row];
1480         ncols  = nzA + nzB;
1481         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1482 
1483         /* load the column indices for this row into cols*/
1484         cols = sbuf_aj_i + ct2;
1485 
1486         lwrite = 0;
1487         for (l=0; l<nzB; l++) {
1488           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1489         }
1490         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1491         for (l=0; l<nzB; l++) {
1492           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1493         }
1494 
1495         ct2 += ncols;
1496       }
1497       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1498     }
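    /* The three passes above emit, for each row, the off-diagonal columns below cstart, then the diagonal-block
       columns (shifted by cstart), then the off-diagonal columns at or beyond cend. Since aj and the
       garray-based bmap are kept sorted, each packed row is sorted by global column index, which lets the
       MAT_REUSE_MATRIX path below reuse stored positions when iscolsorted is true */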
1499 
1500     /* create column map (cmap): global col of C -> local col of submat */
1501 #if defined(PETSC_USE_CTABLE)
1502     if (!allcolumns) {
1503       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1504       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1505       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1506         if (icol[j] >= cstart && icol[j] <cend) {
1507           cmap_loc[icol[j] - cstart] = j+1;
1508         } else { /* use PetscTable for non-local col indices */
1509           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1510         }
1511       }
1512     } else {
1513       cmap     = NULL;
1514       cmap_loc = NULL;
1515     }
1516     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1517 #else
1518     if (!allcolumns) {
1519       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1520       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1521     } else {
1522       cmap = NULL;
1523     }
1524 #endif
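    /* cmap semantics: entries are stored 1-based (local column + 1) so that a lookup of 0 means the column is
       not selected by iscol. With PETSC_USE_CTABLE the diagonal-block columns [cstart,cend) use the dense
       array cmap_loc[] and all other columns go through the hash table cmap */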
1525 
1526     /* Create lens for MatSeqAIJSetPreallocation() */
1527     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1528 
1529     /* Compute lens from local part of C */
1530     for (j=0; j<nrow; j++) {
1531       row  = irow[j];
1532       proc = row2proc[j];
1533       if (proc == rank) {
1534         /* diagonal part A = c->A */
1535         ncols = ai[row-rstart+1] - ai[row-rstart];
1536         cols  = aj + ai[row-rstart];
1537         if (!allcolumns) {
1538           for (k=0; k<ncols; k++) {
1539 #if defined(PETSC_USE_CTABLE)
1540             tcol = cmap_loc[cols[k]];
1541 #else
1542             tcol = cmap[cols[k]+cstart];
1543 #endif
1544             if (tcol) lens[j]++;
1545           }
1546         } else { /* allcolumns */
1547           lens[j] = ncols;
1548         }
1549 
1550         /* off-diagonal part B = c->B */
1551         ncols = bi[row-rstart+1] - bi[row-rstart];
1552         cols  = bj + bi[row-rstart];
1553         if (!allcolumns) {
1554           for (k=0; k<ncols; k++) {
1555 #if defined(PETSC_USE_CTABLE)
1556             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1557 #else
1558             tcol = cmap[bmap[cols[k]]];
1559 #endif
1560             if (tcol) lens[j]++;
1561           }
1562         } else { /* allcolumns */
1563           lens[j] += ncols;
1564         }
1565       }
1566     }
1567 
1568     /* Create row map (rmap): global row of C -> local row of submat */
1569 #if defined(PETSC_USE_CTABLE)
1570     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1571     for (j=0; j<nrow; j++) {
1572       row  = irow[j];
1573       proc = row2proc[j];
1574       if (proc == rank) { /* a local row */
1575         rmap_loc[row - rstart] = j;
1576       } else {
1577         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1578       }
1579     }
1580 #else
1581     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1582     for (j=0; j<nrow; j++) {
1583       rmap[irow[j]] = j;
1584     }
1585 #endif
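    /* rmap maps a global row of C to its row in submat. Without PETSC_USE_CTABLE all rows are stored 0-based
       in the dense array rmap; with PETSC_USE_CTABLE locally owned rows go 0-based into rmap_loc[] while
       remote rows are stored 1-based in the table and decremented after PetscTableFind() */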
1586 
1587     /* Update lens from offproc data */
1588     /* recv a->j is done */
1589     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1590     for (i=0; i<nrqs; i++) {
1591       proc    = pa[i];
1592       sbuf1_i = sbuf1[proc];
1593       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1594       ct1     = 2 + 1;
1595       ct2     = 0;
1596       rbuf2_i = rbuf2[i]; /* received length of C->j */
1597       rbuf3_i = rbuf3[i]; /* received C->j */
1598 
1599       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1600       max1   = sbuf1_i[2];
1601       for (k=0; k<max1; k++,ct1++) {
1602 #if defined(PETSC_USE_CTABLE)
1603         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1604         row--;
1605         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1606 #else
1607         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1608 #endif
1609         /* Now, store row index of submat in sbuf1_i[ct1] */
1610         sbuf1_i[ct1] = row;
1611 
1612         nnz = rbuf2_i[ct1];
1613         if (!allcolumns) {
1614           for (l=0; l<nnz; l++,ct2++) {
1615 #if defined(PETSC_USE_CTABLE)
1616             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1617               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1618             } else {
1619               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1620             }
1621 #else
1622             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1623 #endif
1624             if (tcol) lens[row]++;
1625           }
1626         } else { /* allcolumns */
1627           lens[row] += nnz;
1628         }
1629       }
1630     }
1631     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1632     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1633 
1634     /* Create the submatrices */
1635     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1636     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1637 
1638     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1639     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1640     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1641     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1642     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1643 
1644     /* create struct Mat_SubSppt and attach it to submat */
1645     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1646     subc = (Mat_SeqAIJ*)submat->data;
1647     subc->submatis1 = smatis1;
1648 
1649     smatis1->id          = 0;
1650     smatis1->nrqs        = nrqs;
1651     smatis1->nrqr        = nrqr;
1652     smatis1->rbuf1       = rbuf1;
1653     smatis1->rbuf2       = rbuf2;
1654     smatis1->rbuf3       = rbuf3;
1655     smatis1->sbuf2       = sbuf2;
1656     smatis1->req_source2 = req_source2;
1657 
1658     smatis1->sbuf1       = sbuf1;
1659     smatis1->ptr         = ptr;
1660     smatis1->tmp         = tmp;
1661     smatis1->ctr         = ctr;
1662 
1663     smatis1->pa           = pa;
1664     smatis1->req_size     = req_size;
1665     smatis1->req_source1  = req_source1;
1666 
1667     smatis1->allcolumns  = allcolumns;
1668     smatis1->singleis    = PETSC_TRUE;
1669     smatis1->row2proc    = row2proc;
1670     smatis1->rmap        = rmap;
1671     smatis1->cmap        = cmap;
1672 #if defined(PETSC_USE_CTABLE)
1673     smatis1->rmap_loc    = rmap_loc;
1674     smatis1->cmap_loc    = cmap_loc;
1675 #endif
1676 
1677     smatis1->destroy     = submat->ops->destroy;
1678     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1679     submat->factortype   = C->factortype;
1680 
1681     /* compute rmax */
1682     rmax = 0;
1683     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
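    /* rmax is the length of the longest submatrix row; it sizes the subcols/subvals work arrays used below */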
1684 
1685   } else { /* scall == MAT_REUSE_MATRIX */
1686     submat = submats[0];
1687     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1688 
1689     subc    = (Mat_SeqAIJ*)submat->data;
1690     rmax    = subc->rmax;
1691     smatis1 = subc->submatis1;
1692     nrqs        = smatis1->nrqs;
1693     nrqr        = smatis1->nrqr;
1694     rbuf1       = smatis1->rbuf1;
1695     rbuf2       = smatis1->rbuf2;
1696     rbuf3       = smatis1->rbuf3;
1697     req_source2 = smatis1->req_source2;
1698 
1699     sbuf1     = smatis1->sbuf1;
1700     sbuf2     = smatis1->sbuf2;
1701     ptr       = smatis1->ptr;
1702     tmp       = smatis1->tmp;
1703     ctr       = smatis1->ctr;
1704 
1705     pa         = smatis1->pa;
1706     req_size   = smatis1->req_size;
1707     req_source1 = smatis1->req_source1;
1708 
1709     allcolumns = smatis1->allcolumns;
1710     row2proc   = smatis1->row2proc;
1711     rmap       = smatis1->rmap;
1712     cmap       = smatis1->cmap;
1713 #if defined(PETSC_USE_CTABLE)
1714     rmap_loc   = smatis1->rmap_loc;
1715     cmap_loc   = smatis1->cmap_loc;
1716 #endif
1717   }
1718 
1719   /* Post recv matrix values */
1720   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1721   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1722   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1723   for (i=0; i<nrqs; ++i) {
1724     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1725     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1726   }
1727 
1728   /* Allocate sending buffers for a->a, and send them off */
1729   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1730   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1731   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1732   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
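  /* The values are packed in exactly the same three-pass order as the column indices sent with tag3 above,
     so on the requesting side rbuf3[i][k] and rbuf4[i][k] always refer to the same matrix entry */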
1733 
1734   for (i=0; i<nrqr; i++) {
1735     rbuf1_i   = rbuf1[i];
1736     sbuf_aa_i = sbuf_aa[i];
1737     ct1       = 2*rbuf1_i[0]+1;
1738     ct2       = 0;
1739     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1740 
1741     kmax = rbuf1_i[2];
1742     for (k=0; k<kmax; k++,ct1++) {
1743       row = rbuf1_i[ct1] - rstart;
1744       nzA = ai[row+1] - ai[row];
1745       nzB = bi[row+1] - bi[row];
1746       ncols  = nzA + nzB;
1747       cworkB = bj + bi[row];
1748       vworkA = a_a + ai[row];
1749       vworkB = b_a + bi[row];
1750 
1751       /* load the column values for this row into vals*/
1752       vals = sbuf_aa_i + ct2;
1753 
1754       lwrite = 0;
1755       for (l=0; l<nzB; l++) {
1756         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1757       }
1758       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1759       for (l=0; l<nzB; l++) {
1760         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1761       }
1762 
1763       ct2 += ncols;
1764     }
1765     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1766   }
1767 
1768   /* Assemble submat */
1769   /* First assemble the local rows */
1770   for (j=0; j<nrow; j++) {
1771     row  = irow[j];
1772     proc = row2proc[j];
1773     if (proc == rank) {
1774       Crow = row - rstart;  /* local row index of C */
1775 #if defined(PETSC_USE_CTABLE)
1776       row = rmap_loc[Crow]; /* row index of submat */
1777 #else
1778       row = rmap[row];
1779 #endif
1780 
1781       if (allcolumns) {
1782         /* diagonal part A = c->A */
1783         ncols = ai[Crow+1] - ai[Crow];
1784         cols  = aj + ai[Crow];
1785         vals  = a_a + ai[Crow];
1786         i     = 0;
1787         for (k=0; k<ncols; k++) {
1788           subcols[i]   = cols[k] + cstart;
1789           subvals[i++] = vals[k];
1790         }
1791 
1792         /* off-diagonal part B = c->B */
1793         ncols = bi[Crow+1] - bi[Crow];
1794         cols  = bj + bi[Crow];
1795         vals  = b_a + bi[Crow];
1796         for (k=0; k<ncols; k++) {
1797           subcols[i]   = bmap[cols[k]];
1798           subvals[i++] = vals[k];
1799         }
1800 
1801         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1802 
1803       } else { /* !allcolumns */
1804 #if defined(PETSC_USE_CTABLE)
1805         /* diagonal part A = c->A */
1806         ncols = ai[Crow+1] - ai[Crow];
1807         cols  = aj + ai[Crow];
1808         vals  = a_a + ai[Crow];
1809         i     = 0;
1810         for (k=0; k<ncols; k++) {
1811           tcol = cmap_loc[cols[k]];
1812           if (tcol) {
1813             subcols[i]   = --tcol;
1814             subvals[i++] = vals[k];
1815           }
1816         }
1817 
1818         /* off-diagonal part B = c->B */
1819         ncols = bi[Crow+1] - bi[Crow];
1820         cols  = bj + bi[Crow];
1821         vals  = b_a + bi[Crow];
1822         for (k=0; k<ncols; k++) {
1823           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1824           if (tcol) {
1825             subcols[i]   = --tcol;
1826             subvals[i++] = vals[k];
1827           }
1828         }
1829 #else
1830         /* diagonal part A = c->A */
1831         ncols = ai[Crow+1] - ai[Crow];
1832         cols  = aj + ai[Crow];
1833         vals  = a_a + ai[Crow];
1834         i     = 0;
1835         for (k=0; k<ncols; k++) {
1836           tcol = cmap[cols[k]+cstart];
1837           if (tcol) {
1838             subcols[i]   = --tcol;
1839             subvals[i++] = vals[k];
1840           }
1841         }
1842 
1843         /* off-diagonal part B = c->B */
1844         ncols = bi[Crow+1] - bi[Crow];
1845         cols  = bj + bi[Crow];
1846         vals  = b_a + bi[Crow];
1847         for (k=0; k<ncols; k++) {
1848           tcol = cmap[bmap[cols[k]]];
1849           if (tcol) {
1850             subcols[i]   = --tcol;
1851             subvals[i++] = vals[k];
1852           }
1853         }
1854 #endif
1855         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1856       }
1857     }
1858   }
1859 
1860   /* Now assemble the off-proc rows */
1861   for (i=0; i<nrqs; i++) { /* for each requested message */
1862     /* recv values from other processes */
1863     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1864     proc    = pa[idex];
1865     sbuf1_i = sbuf1[proc];
1866     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1867     ct1     = 2 + 1;
1868     ct2     = 0; /* count of received C->j */
1869     ct3     = 0; /* count of received C->j that will be inserted into submat */
1870     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1871     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1872     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1873 
1874     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1875     max1 = sbuf1_i[2];             /* num of rows */
1876     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1877       row = sbuf1_i[ct1]; /* row index of submat */
1878       if (!allcolumns) {
1879         idex = 0;
1880         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1881           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1882           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1883 #if defined(PETSC_USE_CTABLE)
1884             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1885               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1886             } else {
1887               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1888             }
1889 #else
1890             tcol = cmap[rbuf3_i[ct2]];
1891 #endif
1892             if (tcol) {
1893               subcols[idex]   = --tcol; /* may not be sorted */
1894               subvals[idex++] = rbuf4_i[ct2];
1895 
1896               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1897                For reuse, we replace the received C->j with the index that should be inserted into submat */
1898               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1899             }
1900           }
1901           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1902         } else { /* scall == MAT_REUSE_MATRIX */
1903           submat = submats[0];
1904           subc   = (Mat_SeqAIJ*)submat->data;
1905 
1906           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1907           for (l=0; l<nnz; l++) {
1908             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1909             subvals[idex++] = rbuf4_i[ct2];
1910           }
1911 
1912           bj = subc->j + subc->i[row]; /* sorted column indices */
1913           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1914         }
1915       } else { /* allcolumns */
1916         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1917         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1918         ct2 += nnz;
1919       }
1920     }
1921   }
1922 
1923   /* sends of a->a are done */
1924   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1925   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1926 
1927   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1928   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1929   submats[0] = submat;
1930 
1931   /* Restore the indices */
1932   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1933   if (!allcolumns) {
1934     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1935   }
1936 
1937   /* Destroy allocated memory */
1938   for (i=0; i<nrqs; ++i) {
1939     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1940   }
1941   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1942   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1943   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1944 
1945   if (scall == MAT_INITIAL_MATRIX) {
1946     ierr = PetscFree(lens);CHKERRQ(ierr);
1947     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1948     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1949   }
1950   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1951   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1956 {
1957   PetscErrorCode ierr;
1958   PetscInt       ncol;
1959   PetscBool      colflag,allcolumns=PETSC_FALSE;
1960 
1961   PetscFunctionBegin;
1962   /* Allocate memory to hold all the submatrices */
1963   if (scall == MAT_INITIAL_MATRIX) {
1964     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1965   }
1966 
1967   /* Check for special case: each processor gets entire matrix columns */
1968   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1969   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1970   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1971 
1972   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1977 {
1978   PetscErrorCode ierr;
1979   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1980   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1981   Mat_SeqAIJ     *subc;
1982   Mat_SubSppt    *smat;
1983 
1984   PetscFunctionBegin;
1985   /* Check for special case: each processor has a single IS */
1986   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1987     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1988     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1989     PetscFunctionReturn(0);
1990   }
1991 
1992   /* Collect global wantallmatrix and nstages */
1993   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1994   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1995   if (!nmax) nmax = 1;
1996 
1997   if (scall == MAT_INITIAL_MATRIX) {
1998     /* Collect global wantallmatrix and nstages */
1999     if (ismax == 1 && C->rmap->N == C->cmap->N) {
2000       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2001       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2002       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2003       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2004       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2005         wantallmatrix = PETSC_TRUE;
2006 
2007         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2008       }
2009     }
2010 
2011     /* Determine the number of stages in which the submatrices are extracted.
2012        Each stage will extract nmax submatrices.
2013        nmax is determined by the matrix column dimension.
2014        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2015     */
2016     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
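    /* For example, with 8-byte PetscInt and C->cmap->N = 100000, nmax = 20000000/(100000*8) = 25, so a
       request for ismax = 60 submatrices is handled in nstages = 60/25 + 1 = 3 stages of at most 25 each */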
2017 
2018     in[0] = -1*(PetscInt)wantallmatrix;
2019     in[1] = nstages;
2020     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2021     wantallmatrix = (PetscBool)(-out[0]);
2022     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
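    /* Encoding wantallmatrix as -1 turns the single MPI_MAX reduction into a logical AND across ranks:
       out[0] is -1 (so wantallmatrix becomes PETSC_TRUE) only if every process asked for the whole matrix,
       while out[1] simultaneously picks up the global maximum number of stages */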
2023 
2024   } else { /* MAT_REUSE_MATRIX */
2025     if (ismax) {
2026       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2027       smat = subc->submatis1;
2028     } else { /* (*submat)[0] is a dummy matrix */
2029       smat = (Mat_SubSppt*)(*submat)[0]->data;
2030     }
2031     if (!smat) {
2032       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2033       wantallmatrix = PETSC_TRUE;
2034     } else if (smat->singleis) {
2035       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2036       PetscFunctionReturn(0);
2037     } else {
2038       nstages = smat->nstages;
2039     }
2040   }
2041 
2042   if (wantallmatrix) {
2043     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2044     PetscFunctionReturn(0);
2045   }
2046 
2047   /* Allocate memory to hold all the submatrices and dummy submatrices */
2048   if (scall == MAT_INITIAL_MATRIX) {
2049     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2050   }
2051 
2052   for (i=0,pos=0; i<nstages; i++) {
2053     if (pos+nmax <= ismax) max_no = nmax;
2054     else if (pos >= ismax) max_no = 0;
2055     else                   max_no = ismax-pos;
2056 
2057     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2058     if (!max_no) {
2059       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2060         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2061         smat->nstages = nstages;
2062       }
2063       pos++; /* advance to next dummy matrix if any */
2064     } else pos += max_no;
2065   }
2066 
2067   if (ismax && scall == MAT_INITIAL_MATRIX) {
2068     /* save nstages for reuse */
2069     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2070     smat = subc->submatis1;
2071     smat->nstages = nstages;
2072   }
2073   PetscFunctionReturn(0);
2074 }
2075 
2076 /* -------------------------------------------------------------------------*/
2077 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2078 {
2079   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2080   Mat            A  = c->A;
2081   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2082   const PetscInt **icol,**irow;
2083   PetscInt       *nrow,*ncol,start;
2084   PetscErrorCode ierr;
2085   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2086   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2087   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2088   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2089   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2090 #if defined(PETSC_USE_CTABLE)
2091   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2092 #else
2093   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2094 #endif
2095   const PetscInt *irow_i;
2096   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2097   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2098   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2099   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2100   MPI_Status     *r_status3,*r_status4,*s_status4;
2101   MPI_Comm       comm;
2102   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2103   PetscMPIInt    *onodes1,*olengths1,end;
2104   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2105   Mat_SubSppt    *smat_i;
2106   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2107   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2108 
2109   PetscFunctionBegin;
2110   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2111   size = c->size;
2112   rank = c->rank;
2113 
2114   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2115   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2116 
2117   for (i=0; i<ismax; i++) {
2118     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2119     if (!issorted[i]) iscsorted = issorted[i];
2120 
2121     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2122 
2123     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2124     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2125 
2126     /* Check for special case: allcolumn */
2127     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2128     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2129     if (colflag && ncol[i] == C->cmap->N) {
2130       allcolumns[i] = PETSC_TRUE;
2131       icol[i] = NULL;
2132     } else {
2133       allcolumns[i] = PETSC_FALSE;
2134       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2135     }
2136   }
2137 
2138   if (scall == MAT_REUSE_MATRIX) {
2139     /* Assumes new rows are same length as the old rows */
2140     for (i=0; i<ismax; i++) {
2141       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2142       subc = (Mat_SeqAIJ*)submats[i]->data;
2143       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2144 
2145       /* Initialize matrix as if empty */
2146       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2147 
2148       smat_i   = subc->submatis1;
2149 
2150       nrqs        = smat_i->nrqs;
2151       nrqr        = smat_i->nrqr;
2152       rbuf1       = smat_i->rbuf1;
2153       rbuf2       = smat_i->rbuf2;
2154       rbuf3       = smat_i->rbuf3;
2155       req_source2 = smat_i->req_source2;
2156 
2157       sbuf1     = smat_i->sbuf1;
2158       sbuf2     = smat_i->sbuf2;
2159       ptr       = smat_i->ptr;
2160       tmp       = smat_i->tmp;
2161       ctr       = smat_i->ctr;
2162 
2163       pa          = smat_i->pa;
2164       req_size    = smat_i->req_size;
2165       req_source1 = smat_i->req_source1;
2166 
2167       allcolumns[i] = smat_i->allcolumns;
2168       row2proc[i]   = smat_i->row2proc;
2169       rmap[i]       = smat_i->rmap;
2170       cmap[i]       = smat_i->cmap;
2171     }
2172 
2173     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2174       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2175       smat_i = (Mat_SubSppt*)submats[0]->data;
2176 
2177       nrqs        = smat_i->nrqs;
2178       nrqr        = smat_i->nrqr;
2179       rbuf1       = smat_i->rbuf1;
2180       rbuf2       = smat_i->rbuf2;
2181       rbuf3       = smat_i->rbuf3;
2182       req_source2 = smat_i->req_source2;
2183 
2184       sbuf1       = smat_i->sbuf1;
2185       sbuf2       = smat_i->sbuf2;
2186       ptr         = smat_i->ptr;
2187       tmp         = smat_i->tmp;
2188       ctr         = smat_i->ctr;
2189 
2190       pa          = smat_i->pa;
2191       req_size    = smat_i->req_size;
2192       req_source1 = smat_i->req_source1;
2193 
2194       allcolumns[0] = PETSC_FALSE;
2195     }
2196   } else { /* scall == MAT_INITIAL_MATRIX */
2197     /* Get some new tags to keep the communication clean */
2198     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2199     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2200 
2201     /* evaluate communication - message to whom, length of message, and buffer space
2202      required. Based on this, buffers are allocated, and data copied into them*/
2203     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2204 
2205     for (i=0; i<ismax; i++) {
2206       jmax   = nrow[i];
2207       irow_i = irow[i];
2208 
2209       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2210       row2proc[i] = row2proc_i;
2211 
2212       if (issorted[i]) proc = 0;
2213       for (j=0; j<jmax; j++) {
2214         if (!issorted[i]) proc = 0;
2215         row = irow_i[j];
2216         while (row >= C->rmap->range[proc+1]) proc++;
2217         w4[proc]++;
2218         row2proc_i[j] = proc; /* map row index to proc */
2219       }
2220       for (j=0; j<size; j++) {
2221         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2222       }
2223     }
2224 
2225     nrqs     = 0;              /* no of outgoing messages */
2226     msz      = 0;              /* total mesg length (for all procs) */
2227     w1[rank] = 0;              /* no mesg sent to self */
2228     w3[rank] = 0;
2229     for (i=0; i<size; i++) {
2230       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2231     }
2232     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2233     for (i=0,j=0; i<size; i++) {
2234       if (w1[i]) { pa[j] = i; j++; }
2235     }
2236 
2237     /* Each message would have a header = 1 + 2*(no of IS) + data */
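    /* Message layout for proc j: sbuf1[j] = [nis | is_id_1, nrows_1 | ... | is_id_nis, nrows_nis | rows...],
       i.e. a header of 1 + 2*w3[j] entries followed by the requested global rows grouped by IS, for a total
       length of w1[j] */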
2238     for (i=0; i<nrqs; i++) {
2239       j      = pa[i];
2240       w1[j] += w2[j] + 2* w3[j];
2241       msz   += w1[j];
2242     }
2243     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2244 
2245     /* Determine the number of messages to expect, their lengths, and the from-ids */
2246     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2247     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2248 
2249     /* Now post the Irecvs corresponding to these messages */
2250     tag0 = ((PetscObject)C)->tag;
2251     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2252 
2253     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2254     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2255 
2256     /* Allocate Memory for outgoing messages */
2257     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2258     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2259     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2260 
2261     {
2262       PetscInt *iptr = tmp;
2263       k    = 0;
2264       for (i=0; i<nrqs; i++) {
2265         j        = pa[i];
2266         iptr    += k;
2267         sbuf1[j] = iptr;
2268         k        = w1[j];
2269       }
2270     }
2271 
2272     /* Form the outgoing messages. Initialize the header space */
2273     for (i=0; i<nrqs; i++) {
2274       j           = pa[i];
2275       sbuf1[j][0] = 0;
2276       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2277       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2278     }
2279 
2280     /* Parse the isrow and copy data into outbuf */
2281     for (i=0; i<ismax; i++) {
2282       row2proc_i = row2proc[i];
2283       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2284       irow_i = irow[i];
2285       jmax   = nrow[i];
2286       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2287         proc = row2proc_i[j];
2288         if (proc != rank) { /* copy to the outgoing buf*/
2289           ctr[proc]++;
2290           *ptr[proc] = irow_i[j];
2291           ptr[proc]++;
2292         }
2293       }
2294       /* Update the headers for the current IS */
2295       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2296         if ((ctr_j = ctr[j])) {
2297           sbuf1_j        = sbuf1[j];
2298           k              = ++sbuf1_j[0];
2299           sbuf1_j[2*k]   = ctr_j;
2300           sbuf1_j[2*k-1] = i;
2301         }
2302       }
2303     }
2304 
2305     /* Now post the sends */
2306     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2307     for (i=0; i<nrqs; ++i) {
2308       j    = pa[i];
2309       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2310     }
2311 
2312     /* Post Receives to capture the buffer size */
2313     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2314     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2315     rbuf2[0] = tmp + msz;
2316     for (i=1; i<nrqs; ++i) {
2317       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2318     }
2319     for (i=0; i<nrqs; ++i) {
2320       j    = pa[i];
2321       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2322     }
2323 
2324     /* Send to other procs the buf size they should allocate */
2325     /* Receive messages*/
2326     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2327     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2328     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2329     {
2330       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2331       PetscInt   *sbuf2_i;
2332 
2333       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
2334       for (i=0; i<nrqr; ++i) {
2335         req_size[i] = 0;
2336         rbuf1_i        = rbuf1[i];
2337         start          = 2*rbuf1_i[0] + 1;
2338         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
2339         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2340         sbuf2_i        = sbuf2[i];
2341         for (j=start; j<end; j++) {
2342           id              = rbuf1_i[j] - rstart;
2343           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2344           sbuf2_i[j]      = ncols;
2345           req_size[i] += ncols;
2346         }
2347         req_source1[i] = r_status1[i].MPI_SOURCE;
2348         /* form the header */
2349         sbuf2_i[0] = req_size[i];
2350         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2351 
2352         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2353       }
2354     }
2355     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2356     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2357     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2358 
2359     /* Receive messages*/
2360     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2361     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2362 
2363     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
2364     for (i=0; i<nrqs; ++i) {
2365       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2366       req_source2[i] = r_status2[i].MPI_SOURCE;
2367       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2368     }
2369     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2370     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2371 
2372     /* Wait on sends1 and sends2 */
2373     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2374     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2375 
2376     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);}
2377     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);}
2378     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2379     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2380     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2381     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2382 
2383     /* Now allocate sending buffers for a->j, and send them off */
2384     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2385     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2386     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2387     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2388 
2389     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2390     {
2391       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2392       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2393       PetscInt cend = C->cmap->rend;
2394       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2395 
2396       for (i=0; i<nrqr; i++) {
2397         rbuf1_i   = rbuf1[i];
2398         sbuf_aj_i = sbuf_aj[i];
2399         ct1       = 2*rbuf1_i[0] + 1;
2400         ct2       = 0;
2401         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2402           kmax = rbuf1[i][2*j];
2403           for (k=0; k<kmax; k++,ct1++) {
2404             row    = rbuf1_i[ct1] - rstart;
2405             nzA    = a_i[row+1] - a_i[row];
2406             nzB    = b_i[row+1] - b_i[row];
2407             ncols  = nzA + nzB;
2408             cworkA = a_j + a_i[row];
2409             cworkB = b_j + b_i[row];
2410 
2411             /* load the column indices for this row into cols */
2412             cols = sbuf_aj_i + ct2;
2413 
2414             lwrite = 0;
2415             for (l=0; l<nzB; l++) {
2416               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2417             }
2418             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2419             for (l=0; l<nzB; l++) {
2420               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2421             }
2422 
2423             ct2 += ncols;
2424           }
2425         }
2426         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2427       }
2428     }
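    /* As in the single-IS routine above, each packed row ends up sorted by global column index because the
       off-diagonal columns below cstart, the diagonal-block columns, and the off-diagonal columns at or
       beyond cend are written in that order */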
2429     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2430 
2431     /* create col map: global col of C -> local col of submatrices */
2432     {
2433       const PetscInt *icol_i;
2434 #if defined(PETSC_USE_CTABLE)
2435       for (i=0; i<ismax; i++) {
2436         if (!allcolumns[i]) {
2437           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2438 
2439           jmax   = ncol[i];
2440           icol_i = icol[i];
2441           cmap_i = cmap[i];
2442           for (j=0; j<jmax; j++) {
2443             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2444           }
2445         } else cmap[i] = NULL;
2446       }
2447 #else
2448       for (i=0; i<ismax; i++) {
2449         if (!allcolumns[i]) {
2450           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2451           jmax   = ncol[i];
2452           icol_i = icol[i];
2453           cmap_i = cmap[i];
2454           for (j=0; j<jmax; j++) {
2455             cmap_i[icol_i[j]] = j+1;
2456           }
2457         } else cmap[i] = NULL;
2458       }
2459 #endif
2460     }
2461 
2462     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2463     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2464     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2465 
2466     if (ismax) {
2467       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2468     }
2469     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
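    /* lens[0] is one contiguous zeroed block of sum(nrow[i]) entries; lens[i] point into it so that
       lens[i][j] accumulates the number of nonzeros of local row j of submatrix i */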
2470 
2471     /* Update lens from local data */
2472     for (i=0; i<ismax; i++) {
2473       row2proc_i = row2proc[i];
2474       jmax = nrow[i];
2475       if (!allcolumns[i]) cmap_i = cmap[i];
2476       irow_i = irow[i];
2477       lens_i = lens[i];
2478       for (j=0; j<jmax; j++) {
2479         row = irow_i[j];
2480         proc = row2proc_i[j];
2481         if (proc == rank) {
2482           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2483           if (!allcolumns[i]) {
2484             for (k=0; k<ncols; k++) {
2485 #if defined(PETSC_USE_CTABLE)
2486               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2487 #else
2488               tcol = cmap_i[cols[k]];
2489 #endif
2490               if (tcol) lens_i[j]++;
2491             }
2492           } else { /* allcolumns */
2493             lens_i[j] = ncols;
2494           }
2495           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2496         }
2497       }
2498     }
2499 
2500     /* Create row map: global row of C -> local row of submatrices */
2501 #if defined(PETSC_USE_CTABLE)
2502     for (i=0; i<ismax; i++) {
2503       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2504       irow_i = irow[i];
2505       jmax   = nrow[i];
2506       for (j=0; j<jmax; j++) {
2507         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2508       }
2509     }
2510 #else
2511     for (i=0; i<ismax; i++) {
2512       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2513       rmap_i = rmap[i];
2514       irow_i = irow[i];
2515       jmax   = nrow[i];
2516       for (j=0; j<jmax; j++) {
2517         rmap_i[irow_i[j]] = j;
2518       }
2519     }
2520 #endif
2521 
2522     /* Update lens from offproc data */
2523     {
2524       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2525 
2526       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
2527       for (tmp2=0; tmp2<nrqs; tmp2++) {
2528         sbuf1_i = sbuf1[pa[tmp2]];
2529         jmax    = sbuf1_i[0];
2530         ct1     = 2*jmax+1;
2531         ct2     = 0;
2532         rbuf2_i = rbuf2[tmp2];
2533         rbuf3_i = rbuf3[tmp2];
2534         for (j=1; j<=jmax; j++) {
2535           is_no  = sbuf1_i[2*j-1];
2536           max1   = sbuf1_i[2*j];
2537           lens_i = lens[is_no];
2538           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2539           rmap_i = rmap[is_no];
2540           for (k=0; k<max1; k++,ct1++) {
2541 #if defined(PETSC_USE_CTABLE)
2542             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2543             row--;
2544             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2545 #else
2546             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new (sub)matrix */
2547 #endif
2548             max2 = rbuf2_i[ct1];
2549             for (l=0; l<max2; l++,ct2++) {
2550               if (!allcolumns[is_no]) {
2551 #if defined(PETSC_USE_CTABLE)
2552                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2553 #else
2554                 tcol = cmap_i[rbuf3_i[ct2]];
2555 #endif
2556                 if (tcol) lens_i[row]++;
2557               } else { /* allcolumns */
2558                 lens_i[row]++; /* lens_i[row] += max2 ? */
2559               }
2560             }
2561           }
2562         }
2563       }
2564     }
2565     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2566     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);}
2567     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2568     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2569 
2570     /* Create the submatrices */
2571     for (i=0; i<ismax; i++) {
2572       PetscInt    rbs,cbs;
2573 
2574       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2575       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2576 
2577       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2578       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2579 
2580       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2581       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2582       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2583 
2584       /* create struct Mat_SubSppt and attach it to submat */
2585       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2586       subc = (Mat_SeqAIJ*)submats[i]->data;
2587       subc->submatis1 = smat_i;
2588 
2589       smat_i->destroy          = submats[i]->ops->destroy;
2590       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2591       submats[i]->factortype   = C->factortype;
2592 
2593       smat_i->id          = i;
2594       smat_i->nrqs        = nrqs;
2595       smat_i->nrqr        = nrqr;
2596       smat_i->rbuf1       = rbuf1;
2597       smat_i->rbuf2       = rbuf2;
2598       smat_i->rbuf3       = rbuf3;
2599       smat_i->sbuf2       = sbuf2;
2600       smat_i->req_source2 = req_source2;
2601 
2602       smat_i->sbuf1       = sbuf1;
2603       smat_i->ptr         = ptr;
2604       smat_i->tmp         = tmp;
2605       smat_i->ctr         = ctr;
2606 
2607       smat_i->pa           = pa;
2608       smat_i->req_size     = req_size;
2609       smat_i->req_source1  = req_source1;
2610 
2611       smat_i->allcolumns  = allcolumns[i];
2612       smat_i->singleis    = PETSC_FALSE;
2613       smat_i->row2proc    = row2proc[i];
2614       smat_i->rmap        = rmap[i];
2615       smat_i->cmap        = cmap[i];
2616     }
2617 
2618     if (!ismax) { /* Create dummy submats[0] to carry struct Mat_SubSppt for reuse */
2619       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2620       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2621       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2622 
2623       /* create struct Mat_SubSppt and attach it to submat */
2624       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2625       submats[0]->data = (void*)smat_i;
2626 
2627       smat_i->destroy          = submats[0]->ops->destroy;
2628       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2629       submats[0]->factortype   = C->factortype;
2630 
2631       smat_i->id          = 0;
2632       smat_i->nrqs        = nrqs;
2633       smat_i->nrqr        = nrqr;
2634       smat_i->rbuf1       = rbuf1;
2635       smat_i->rbuf2       = rbuf2;
2636       smat_i->rbuf3       = rbuf3;
2637       smat_i->sbuf2       = sbuf2;
2638       smat_i->req_source2 = req_source2;
2639 
2640       smat_i->sbuf1       = sbuf1;
2641       smat_i->ptr         = ptr;
2642       smat_i->tmp         = tmp;
2643       smat_i->ctr         = ctr;
2644 
2645       smat_i->pa           = pa;
2646       smat_i->req_size     = req_size;
2647       smat_i->req_source1  = req_source1;
2648 
2649       smat_i->allcolumns  = PETSC_FALSE;
2650       smat_i->singleis    = PETSC_FALSE;
2651       smat_i->row2proc    = NULL;
2652       smat_i->rmap        = NULL;
2653       smat_i->cmap        = NULL;
2654     }
2655 
2656     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2657     ierr = PetscFree(lens);CHKERRQ(ierr);
2658     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2659     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2660 
2661   } /* endof scall == MAT_INITIAL_MATRIX */
2662 
2663   /* Post recv matrix values */
2664   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2665   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2666   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2667   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2668   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2669   for (i=0; i<nrqs; ++i) {
2670     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2671     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2672   }
2673 
2674   /* Allocate sending buffers for a->a, and send them off */
2675   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2676   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2677   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2678   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2679 
2680   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2681   {
2682     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2683     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2684     PetscInt    cend   = C->cmap->rend;
2685     PetscInt    *b_j   = b->j;
2686     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2687 
2688     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2689     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2690     for (i=0; i<nrqr; i++) {
2691       rbuf1_i   = rbuf1[i];
2692       sbuf_aa_i = sbuf_aa[i];
2693       ct1       = 2*rbuf1_i[0]+1;
2694       ct2       = 0;
2695       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2696         kmax = rbuf1_i[2*j];
2697         for (k=0; k<kmax; k++,ct1++) {
2698           row    = rbuf1_i[ct1] - rstart;
2699           nzA    = a_i[row+1] - a_i[row];
2700           nzB    = b_i[row+1] - b_i[row];
2701           ncols  = nzA + nzB;
2702           cworkB = b_j + b_i[row];
2703           vworkA = a_a + a_i[row];
2704           vworkB = b_a + b_i[row];
2705 
2706           /* load the column values for this row into vals*/
2707           vals = sbuf_aa_i+ct2;
2708 
2709           lwrite = 0;
2710           for (l=0; l<nzB; l++) {
2711             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2712           }
2713           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2714           for (l=0; l<nzB; l++) {
2715             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2716           }
2717 
2718           ct2 += ncols;
2719         }
2720       }
2721       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2722     }
2723     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2724     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2725   }
2726 
2727   /* Assemble the matrices */
2728   /* First assemble the local rows */
2729   for (i=0; i<ismax; i++) {
2730     row2proc_i = row2proc[i];
2731     subc      = (Mat_SeqAIJ*)submats[i]->data;
2732     imat_ilen = subc->ilen;
2733     imat_j    = subc->j;
2734     imat_i    = subc->i;
2735     imat_a    = subc->a;
2736 
2737     if (!allcolumns[i]) cmap_i = cmap[i];
2738     rmap_i = rmap[i];
2739     irow_i = irow[i];
2740     jmax   = nrow[i];
2741     for (j=0; j<jmax; j++) {
2742       row  = irow_i[j];
2743       proc = row2proc_i[j];
2744       if (proc == rank) {
2745         old_row = row;
2746 #if defined(PETSC_USE_CTABLE)
2747         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2748         row--;
2749 #else
2750         row = rmap_i[row];
2751 #endif
2752         ilen_row = imat_ilen[row];
2753         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2754         mat_i    = imat_i[row];
2755         mat_a    = imat_a + mat_i;
2756         mat_j    = imat_j + mat_i;
2757         if (!allcolumns[i]) {
2758           for (k=0; k<ncols; k++) {
2759 #if defined(PETSC_USE_CTABLE)
2760             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2761 #else
2762             tcol = cmap_i[cols[k]];
2763 #endif
2764             if (tcol) {
2765               *mat_j++ = tcol - 1;
2766               *mat_a++ = vals[k];
2767               ilen_row++;
2768             }
2769           }
2770         } else { /* allcolumns */
2771           for (k=0; k<ncols; k++) {
2772             *mat_j++ = cols[k];  /* global col index! */
2773             *mat_a++ = vals[k];
2774             ilen_row++;
2775           }
2776         }
2777         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2778 
2779         imat_ilen[row] = ilen_row;
2780       }
2781     }
2782   }
2783 
2784   /* Now assemble the off proc rows */
2785   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr);
2786   for (tmp2=0; tmp2<nrqs; tmp2++) {
2787     sbuf1_i = sbuf1[pa[tmp2]];
2788     jmax    = sbuf1_i[0];
2789     ct1     = 2*jmax + 1;
2790     ct2     = 0;
2791     rbuf2_i = rbuf2[tmp2];
2792     rbuf3_i = rbuf3[tmp2];
2793     rbuf4_i = rbuf4[tmp2];
2794     for (j=1; j<=jmax; j++) {
2795       is_no     = sbuf1_i[2*j-1];
2796       rmap_i    = rmap[is_no];
2797       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2798       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2799       imat_ilen = subc->ilen;
2800       imat_j    = subc->j;
2801       imat_i    = subc->i;
2802       imat_a    = subc->a;
2803       max1      = sbuf1_i[2*j];
2804       for (k=0; k<max1; k++,ct1++) {
2805         row = sbuf1_i[ct1];
2806 #if defined(PETSC_USE_CTABLE)
2807         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2808         row--;
2809 #else
2810         row = rmap_i[row];
2811 #endif
2812         ilen  = imat_ilen[row];
2813         mat_i = imat_i[row];
2814         mat_a = imat_a + mat_i;
2815         mat_j = imat_j + mat_i;
2816         max2  = rbuf2_i[ct1];
2817         if (!allcolumns[is_no]) {
2818           for (l=0; l<max2; l++,ct2++) {
2819 #if defined(PETSC_USE_CTABLE)
2820             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2821 #else
2822             tcol = cmap_i[rbuf3_i[ct2]];
2823 #endif
2824             if (tcol) {
2825               *mat_j++ = tcol - 1;
2826               *mat_a++ = rbuf4_i[ct2];
2827               ilen++;
2828             }
2829           }
2830         } else { /* allcolumns */
2831           for (l=0; l<max2; l++,ct2++) {
2832             *mat_j++ = rbuf3_i[ct2]; /* same global column index as in C */
2833             *mat_a++ = rbuf4_i[ct2];
2834             ilen++;
2835           }
2836         }
2837         imat_ilen[row] = ilen;
2838       }
2839     }
2840   }
2841 
2842   if (!iscsorted) { /* sort column indices of the rows */
2843     for (i=0; i<ismax; i++) {
2844       subc      = (Mat_SeqAIJ*)submats[i]->data;
2845       imat_j    = subc->j;
2846       imat_i    = subc->i;
2847       imat_a    = subc->a;
2848       imat_ilen = subc->ilen;
2849 
2850       if (allcolumns[i]) continue;
2851       jmax = nrow[i];
2852       for (j=0; j<jmax; j++) {
2853         mat_i = imat_i[j];
2854         mat_a = imat_a + mat_i;
2855         mat_j = imat_j + mat_i;
2856         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2857       }
2858     }
2859   }
2860 
2861   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2862   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2863   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);}
2864   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2865   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2866 
2867   /* Restore the indices */
2868   for (i=0; i<ismax; i++) {
2869     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2870     if (!allcolumns[i]) {
2871       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2872     }
2873   }
2874 
2875   for (i=0; i<ismax; i++) {
2876     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2877     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2878   }
2879 
2880   /* Destroy allocated memory */
2881   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2882   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2883   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2884 
2885   for (i=0; i<nrqs; ++i) {
2886     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2887   }
2888   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2889 
2890   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2891   PetscFunctionReturn(0);
2892 }
2893 
2894 /*
2895  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2896  The embeddings are assumed to be injections; this implies that the range of rowemb is a subset
2897  of [0,m), that of dcolemb a subset of [0,n), and that of ocolemb a subset of [0,N-n).
2898  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A & B.
2899  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2900  state, and needs to be "assembled" later by compressing B's column space.
2901 
2902  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2903  Following this call, C->A & C->B have been created, even if empty.
2904  */
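/*
 A hedged, purely illustrative example (not code from this routine) of the "disassembled" column
 mapping described above.  Assume this rank's ownership range is [10,20) (rstart = 10, rend = 20,
 so shift = rend - rstart = 10) and C has N = 30 global columns; B then has N - n = 20 local
 columns, which are spread over C's global column space by skipping the locally owned block:

   local cols  0.. 9  ->  global cols  0.. 9   (left of the owned block, unchanged)
   local cols 10..19  ->  global cols 20..29   (right of the owned block, shifted by shift)

 This corresponds to the "if (Bdisassembled && col >= rstart) col += shift;" step in the body below.
 */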
2905 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2906 {
2907   /* If making this function public, change the error returned in this function away from _PLIB. */
2908   PetscErrorCode ierr;
2909   Mat_MPIAIJ     *aij;
2910   Mat_SeqAIJ     *Baij;
2911   PetscBool      seqaij,Bdisassembled;
2912   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2913   PetscScalar    v;
2914   const PetscInt *rowindices,*colindices;
2915 
2916   PetscFunctionBegin;
2917   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2918   if (A) {
2919     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2920     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2921     if (rowemb) {
2922       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2923       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2924     } else {
2925       if (C->rmap->n != A->rmap->n) {
2926         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2927       }
2928     }
2929     if (dcolemb) {
2930       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2931       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2932     } else {
2933       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2934     }
2935   }
2936   if (B) {
2937     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2938     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2939     if (rowemb) {
2940       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2941       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2942     } else {
2943       if (C->rmap->n != B->rmap->n) {
2944         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2945       }
2946     }
2947     if (ocolemb) {
2948       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2949       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2950     } else {
2951       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2952     }
2953   }
2954 
2955   aij = (Mat_MPIAIJ*)C->data;
2956   if (!aij->A) {
2957     /* Mimic parts of MatMPIAIJSetPreallocation() */
2958     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2959     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2960     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2961     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2962     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2963   }
2964   if (A) {
2965     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2966   } else {
2967     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2968   }
2969   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2970     /*
2971       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2972       need to "disassemble" B -- convert it to using C's global indices.
2973       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2974 
2975       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2976       we call MatZeroEntries(aij->B) first, so the result may contain explicit zeros that could, in principle, be compacted out.
2977 
2978       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2979       At least avoid calling MatSetValues() and the implied searches?
2980     */
2981 
2982     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2983 #if defined(PETSC_USE_CTABLE)
2984       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2985 #else
2986       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2987       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2988       if (aij->B) {
2989         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2990       }
2991 #endif
2992       ngcol = 0;
2993       if (aij->lvec) {
2994         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2995       }
2996       if (aij->garray) {
2997         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2998         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2999       }
3000       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3001       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3002     }
3003     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3004       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3005     }
3006     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3007       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3008     }
3009   }
3010   Bdisassembled = PETSC_FALSE;
3011   if (!aij->B) {
3012     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3013     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3014     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3015     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3016     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3017     Bdisassembled = PETSC_TRUE;
3018   }
3019   if (B) {
3020     Baij = (Mat_SeqAIJ*)B->data;
3021     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3022       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3023       for (i=0; i<B->rmap->n; i++) {
3024         nz[i] = Baij->i[i+1] - Baij->i[i];
3025       }
3026       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3027       ierr = PetscFree(nz);CHKERRQ(ierr);
3028     }
3029 
3030     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3031     shift = rend-rstart;
3032     count = 0;
3033     rowindices = NULL;
3034     colindices = NULL;
3035     if (rowemb) {
3036       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3037     }
3038     if (ocolemb) {
3039       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3040     }
3041     for (i=0; i<B->rmap->n; i++) {
3042       PetscInt row;
3043       row = i;
3044       if (rowindices) row = rowindices[i];
3045       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3046         col  = Baij->j[count];
3047         if (colindices) col = colindices[col];
3048         if (Bdisassembled && col>=rstart) col += shift;
3049         v    = Baij->a[count];
3050         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3051         ++count;
3052       }
3053     }
3054     /* No assembly for aij->B is necessary. */
3055     /* FIXME: set aij->B's nonzerostate correctly. */
3056   } else {
3057     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3058   }
3059   C->preallocated  = PETSC_TRUE;
3060   C->was_assembled = PETSC_FALSE;
3061   C->assembled     = PETSC_FALSE;
3062    /*
3063       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3064       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3065    */
3066   PetscFunctionReturn(0);
3067 }
3068 
3069 /*
3070   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
3071  */
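/*
  A minimal hedged sketch, not part of this routine, of how a caller might translate B's local
  column indices into C's global column indices; it assumes the standard MatMPIAIJGetSeqAIJ()
  accessor, which also hands back the garray referred to above.  Every local column ao->j[k] of
  B corresponds to the global column garray[ao->j[k]] of C:

    Mat            Ad,Ao;
    Mat_SeqAIJ     *ao;
    const PetscInt *garray;
    PetscInt       k,nz,globalcol;
    PetscErrorCode ierr;

    ierr = MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);CHKERRQ(ierr);
    ao = (Mat_SeqAIJ*)Ao->data;
    nz = ao->i[Ao->rmap->n];
    for (k=0; k<nz; k++) globalcol = garray[ao->j[k]];
 */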
3072 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3073 {
3074   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3075 
3076   PetscFunctionBegin;
3077   PetscValidPointer(A,2);
3078   PetscValidPointer(B,3);
3079   /* FIXME: make sure C is assembled */
3080   *A = aij->A;
3081   *B = aij->B;
3082   /* Note that we don't incref *A and *B, so be careful! */
3083   PetscFunctionReturn(0);
3084 }
3085 
3086 /*
3087   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3088   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3089 */
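/*
  A hedged usage sketch of the public interface that reaches this routine for MPIAIJ matrices,
  namely MatCreateSubMatricesMPI(); the index sets below (this rank's owned rows and the same
  range as columns, built as ISs on C's communicator so that they are parallel) are illustrative
  assumptions only:

    IS             isrow,iscol;
    Mat            *submat;
    PetscInt       rstart,rend;
    PetscErrorCode ierr;

    ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),rend-rstart,rstart,1,&isrow);CHKERRQ(ierr);
    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),rend-rstart,rstart,1,&iscol);CHKERRQ(ierr);
    ierr = MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
    ierr = MatDestroySubMatrices(1,&submat);CHKERRQ(ierr);
    ierr = ISDestroy(&isrow);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol);CHKERRQ(ierr);

  Because isrow/iscol live on a communicator of size > 1, the extracted submat[0] is itself a
  parallel MPIAIJ matrix assembled from the sequential diag/off-diag pieces handled below.
*/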
3090 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3091                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3092                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3093                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3094                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3095 {
3096   PetscErrorCode ierr;
3097   PetscMPIInt    size,flag;
3098   PetscInt       i,ii,cismax,ispar;
3099   Mat            *A,*B;
3100   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3101 
3102   PetscFunctionBegin;
3103   if (!ismax) PetscFunctionReturn(0);
3104 
3105   for (i = 0, cismax = 0; i < ismax; ++i) {
3106     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3107     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3108     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3109     if (size > 1) ++cismax;
3110   }
3111 
3112   /*
3113     Purely sequential matrix extraction can be used if and only if cismax is zero on all of C's ranks.
3114     ispar is the maximum of cismax over C's comm, so it is nonzero exactly when some rank has a parallel IS.
3115   */
3116   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3117   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3118     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3119     PetscFunctionReturn(0);
3120   }
3121 
3122   /* if (ispar) */
3123   /*
3124     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3125     These are used to extract the off-diag portion of the resulting parallel matrix.
3126     The row IS for the off-diag portion is the same as for the diag portion,
3127     so we merely alias the row IS (without incrementing its reference count), skipping those that are sequential.
3128   */
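  /*
    A hedged concrete illustration of the nonlocal IS used below: if iscol[i] is distributed over
    two ranks, with rank 0 holding the indices {1,5} and rank 1 holding {0,7}, then
    ISGetNonlocalIS() yields on rank 0 the sequential IS {0,7} (the entries stored on rank 1) and
    on rank 1 the sequential IS {1,5}; gathering every rank's off-processor view in this way is
    what makes this step non-scalable.
  */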
3129   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3130   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3131   for (i = 0, ii = 0; i < ismax; ++i) {
3132     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3133     if (size > 1) {
3134       /*
3135          TODO: This is the part that's ***NOT SCALABLE***.
3136          To fix this we need to extract just the indices of C's nonzero columns
3137          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3138          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3139          to be done without serializing on the IS list, so, most likely, it is best
3140          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3141       */
3142       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3143       /* Now we have to
3144          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3145              were sorted on each rank, concatenated they might no longer be sorted;
3146          (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
3147              indices in nondecreasing order to the original index positions.
3148          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3149       */
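      /*
         A hedged concrete example of (b): if ciscol[ii] = {7,2,9}, ISSort() turns it into {2,7,9},
         and ciscol_p maps sorted positions back to the original ones, i.e. {1,0,2} (the smallest
         entry, 2, came from original position 1, and so on).  Since PETSC_FALSE is passed for the
         "always" argument, an already strictly increasing ciscol[ii] yields a NULL permutation.
      */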
3150       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3151       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3152       ++ii;
3153     }
3154   }
3155   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3156   for (i = 0, ii = 0; i < ismax; ++i) {
3157     PetscInt       j,issize;
3158     const PetscInt *indices;
3159 
3160     /*
3161        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3162      */
3163     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3164     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3165     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3166     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3167     for (j = 1; j < issize; ++j) {
3168       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3169     }
3170     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3171     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3172     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3173     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3174     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3175     for (j = 1; j < issize; ++j) {
3176       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3177     }
3178     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3179     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3180     if (size > 1) {
3181       cisrow[ii] = isrow[i];
3182       ++ii;
3183     }
3184   }
3185   /*
3186     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3187     array of sequential matrices underlying the resulting parallel matrices.
3188     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3189     contain duplicates.
3190 
3191     There are as many diag matrices as there are original index sets. There are only as many parallel
3192     and off-diag matrices as there are parallel (comm size > 1) index sets.
3193 
3194     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3195     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3196       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3197       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3198       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3199     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3200       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3201       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3202       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3203       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3204     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3205       A[i], B[ii], AA[i] or BB[ii] matrices.
3206   */
3207   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3208   if (scall == MAT_INITIAL_MATRIX) {
3209     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3210   }
3211 
3212   /* Now obtain the sequential A and B submatrices separately. */
3213   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3214   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3215   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3216 
3217   /*
3218     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3219     matrices A & B have been extracted directly into the parallel matrices containing them, or
3220     simply into the sequential matrix identical with the corresponding A (if size == 1).
3221     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3222     to have the same sparsity pattern.
3223     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3224     must be constructed for C. This is done by setseqmat(s).
3225   */
3226   for (i = 0, ii = 0; i < ismax; ++i) {
3227     /*
3228        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3229        That way we can avoid sorting and computing permutations when reusing.
3230        To this end:
3231         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3232         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3233           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3234     */
3235     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3236 
3237     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3238     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3239     if (size > 1) {
3240       if (scall == MAT_INITIAL_MATRIX) {
3241         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3242         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3243         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3244         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3245         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3246       }
3247       /*
3248         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3249       */
3250       {
3251         Mat AA = A[i],BB = B[ii];
3252 
3253         if (AA || BB) {
3254           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3255           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3256           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3257         }
3258         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3259       }
3260       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3261       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3262       ++ii;
3263     } else { /* if (size == 1) */
3264       if (scall == MAT_REUSE_MATRIX) {
3265         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3266       }
3267       if (isrow_p[i] || iscol_p[i]) {
3268         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3269         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3270         /* Otherwise A is extracted straight into (*submat)[i]. */
3271         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3272         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3273       } else (*submat)[i] = A[i];
3274     }
3275     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3276     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3277   }
3278   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3279   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3280   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3281   ierr = PetscFree(A);CHKERRQ(ierr);
3282   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3283   PetscFunctionReturn(0);
3284 }
3285 
3286 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3287 {
3288   PetscErrorCode ierr;
3289 
3290   PetscFunctionBegin;
3291   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3292   PetscFunctionReturn(0);
3293 }
3294