xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision efd4aadf157bf1ba2d80c2be092fcf4247860003)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One memory-scalable round of
   overlap expansion.

   Rows in is[] owned by other ranks are grouped per destination, shipped
   to their owners via a PetscSF built from PetscCommBuildTwoSided, expanded
   there (MatIncreaseOverlap_MPIAIJ_Send_Scalable), and the resulting column
   indices are sent back and merged into is[]
   (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).  Locally owned rows are
   expanded in place by MatIncreaseOverlap_MPIAIJ_Local_Scalable, overlapped
   with the second communication phase.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record <owner, is id, row> for each off-rank row */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages on any rank
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: 2 entries (<is id, row>) per row */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  /* leaves point at <sender rank, offset within sender's buffer> */
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* sf takes ownership of remote (PETSC_OWN_POINTER) */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second round: reply sizes flow back along the reversed pattern */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the returned overlap columns into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap data received
   from remote owners into the local index sets.

   recvdata is a packed sequence of triples <is id, count, columns...>
   (nrecvs entries total).  For each triple the columns are appended to a
   scratch copy of is[is id]; finally every IS is recreated with sorted,
   duplicate-free indices.  The original IS objects are destroyed and
   replaced in place.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one scratch strip of (max_lsize+nrecvs) entries per IS: existing
     indices plus, at worst, every received entry for that IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and rebuild each IS */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - On the owning rank, expand the
   rows requested by remote ranks and pack the replies.

   fromrows holds pairs <is id, row> grouped by requesting rank (group sizes
   in fromsizes[2*i]).  For every requested row the column indices of both
   the diagonal (Amat) and off-diagonal (Bmat, mapped through gcols) parts
   are collected, sorted and deduplicated per IS, and packed into *sbrows as
   triples <is id, count, columns...>; per-rank sizes and send offsets are
   returned in *sbrowsizes.  The caller takes ownership of *sbrowsizes and
   *sbrows.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;  /* NOTE(review): set below but apparently unused in this routine */
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count the reply size (indv_counts, totalrows) per <rank, IS> */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;  /* global row -> local row of Amat/Bmat */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;    /* local col -> global col */
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];       /* off-diagonal cols are already global via gcols */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal again, this time packing the data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into send offsets (stored in sbsizes[2*i+1]) */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Local part of the scalable
   overlap expansion.

   For each locally owned row referenced in is[i], the column indices of the
   diagonal (Amat) and off-diagonal (Bmat, mapped through gcols) parts are
   gathered; is[i] is then recreated with the sorted, duplicate-free result.
   Rows owned by other ranks are skipped here (they are handled by the
   send/receive path in MatIncreaseOverlap_MPIAIJ_Once_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;  /* only expand rows this rank owns */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local col -> global col */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: gcols maps to global cols */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
447 */
/*
   MatIncreaseOverlap_MPIAIJ_Once - One round of overlap expansion
   (non-scalable path); see the message-format comment above.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total entries to send to proc p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of ISs contributing to p, w4 = per-IS scratch counts */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;  /* one bit per global row, per IS */
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) { /* first time this row is seen locally */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;  /* count of rows for IS i */
          outdat_j[2*k-1] = i;      /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header pairs <is_no, count>, then data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* first data entry after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* recreate the index sets from the accumulated rows */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    /* table maps (row+1) -> (position+1); undo the +1 shifts while unpacking */
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions

   For each of the imax index sets, every row currently in the set is scanned
   and all column indices of that row (from both the diagonal block A and the
   off-diagonal block B) are appended to the set if not already present;
   membership is tracked with the bit table table[i].  With PETSC_USE_CTABLE
   the set contents live in a PetscTable (table_data[i]) instead of the flat
   array data[i].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;   /* diagonal and off-diagonal blocks of C */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;   /* first global row owned by this process */
  cstart = C->cmap->rstart;   /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;         /* maps compressed B column indices to global columns */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table stores
       (global_index+1 -> position+1), so both are decremented on extraction */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];   /* current size of set i; grows as new columns are found */
    max     = isz[i];   /* only the rows present on entry are scanned */

    for (j=0; j<max; j++) {
      /* convert global row index to a local row of the diagonal block */
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* local column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {  /* first time this column is seen */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* compressed column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;   /* publish the enlarged set size */

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
899 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  All replies share one contiguous allocation (xdata[0]); xdata[i] points into
  it at the start of reply i.  For better efficiency perhaps we should malloc
  separately each xdata[i], then if a realloc is required we need only copy
  the data for that one row rather than all previous rows as it is now where
  a single large chunk of memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;   /* diagonal and off-diagonal blocks of C */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;        /* global row count: the bit table spans all rows */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;         /* maps compressed B column indices to global columns */


  /* first pass over the requests: ct = total number of index sets,
     total_sz = total number of requested row indices; used to size the reply */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];      /* number of index sets in message i */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate reply size from the average nonzeros per locally owned row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;   /* total entries written into the shared reply buffer so far */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;    /* read cursor: first row index after the header */
    ct2    =  ct1;           /* write cursor into xdata[i] (header space reserved) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);   /* fresh dedup table per IS */
      oct2 = ct2;            /* start of this IS's data within xdata[i] */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {   /* copy each requested row once */
          if (!(ct3 < mem_estimate)) {
            /* shared buffer full: grow by 1.5x and rebase xdata[1..i] into it */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* for every row just stored, append its column indices: diagonal block A
         (shifted by cstart to global) then off-diagonal block B (via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;   /* local row of the owned block */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix

    A     - the distributed MPIAIJ matrix
    flag  - MAT_GET_VALUES: gather the numerical values as well; otherwise only
            the nonzero structure is assembled
    scall - MAT_INITIAL_MATRIX: allocate *Bin and build the structure;
            MAT_REUSE_MATRIX: refill the matrix created by a previous call
    Bin   - output: array of length one holding the sequential copy of A
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* local rows: nonzeros = diagonal-block count + off-diagonal-block count */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* write my rows directly into B's j array at the offset of my first row;
       the same region doubles as the send buffer for the allgather below */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      /* B's columns are compressed; garray maps them back to global indices */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      /* NOTE(review): rmap->rstart is used as the diagonal-block column offset,
         which presumes the row and column layouts coincide — confirm for
         non-square layouts */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by process i (sum of its row lengths) */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices: prefix sums of the row lengths */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse the matrix built by a previous MAT_INITIAL_MATRIX call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* mirror the index pass above: lower off-diagonal, diagonal, upper
       off-diagonal — values must land in the same order as the indices did */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs are NULL on the MAT_REUSE_MATRIX path; allocate here */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1222 
1223 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1224 {
1225   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1226   Mat            submat,A = c->A,B = c->B;
1227   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1228   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1229   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1230   const PetscInt *icol,*irow;
1231   PetscInt       nrow,ncol,start;
1232   PetscErrorCode ierr;
1233   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1234   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1235   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1236   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1237   PetscInt       *lens,rmax,ncols,*cols,Crow;
1238 #if defined(PETSC_USE_CTABLE)
1239   PetscTable     cmap,rmap;
1240   PetscInt       *cmap_loc,*rmap_loc;
1241 #else
1242   PetscInt       *cmap,*rmap;
1243 #endif
1244   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1245   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1246   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1247   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1248   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1249   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1250   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1251   MPI_Comm       comm;
1252   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1253   PetscMPIInt    *onodes1,*olengths1,idex,end;
1254   Mat_SubSppt    *smatis1;
1255   PetscBool      isrowsorted;
1256 
1257   PetscFunctionBegin;
1258   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1259 
1260   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1261   size = c->size;
1262   rank = c->rank;
1263 
1264   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1265   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1266   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1267   if (allcolumns) {
1268     icol = NULL;
1269     ncol = C->cmap->N;
1270   } else {
1271     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1272     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1273   }
1274 
1275   if (scall == MAT_INITIAL_MATRIX) {
1276     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1277 
1278     /* Get some new tags to keep the communication clean */
1279     tag1 = ((PetscObject)C)->tag;
1280     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1281     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1282 
1283     /* evaluate communication - mesg to who, length of mesg, and buffer space
1284      required. Based on this, buffers are allocated, and data copied into them */
1285     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1286     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1287 
1288     /* w1[proc] = num of rows owned by proc -- to be requested */
1289     proc = 0;
1290     nrqs = 0; /* num of outgoing messages */
1291     for (j=0; j<nrow; j++) {
1292       row  = irow[j];
1293       if (!isrowsorted) proc = 0;
1294       while (row >= C->rmap->range[proc+1]) proc++;
1295       w1[proc]++;
1296       row2proc[j] = proc; /* map row index to proc */
1297 
1298       if (proc != rank && !w2[proc]) {
1299         w2[proc] = 1; nrqs++;
1300       }
1301     }
1302     w1[rank] = 0;  /* rows owned by self will not be requested */
1303 
1304     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1305     for (proc=0,j=0; proc<size; proc++) {
1306       if (w1[proc]) { pa[j++] = proc;}
1307     }
1308 
1309     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1310     msz = 0;              /* total mesg length (for all procs) */
1311     for (i=0; i<nrqs; i++) {
1312       proc      = pa[i];
1313       w1[proc] += 3;
1314       msz      += w1[proc];
1315     }
1316     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1317 
1318     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1319     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1320     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1321 
1322     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1323        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1324     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1325 
1326     /* Now post the Irecvs corresponding to these messages */
1327     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1328 
1329     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1330     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1331 
1332     /* Allocate Memory for outgoing messages */
1333     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1334     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1335     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1336 
1337     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1338     iptr = tmp;
1339     for (i=0; i<nrqs; i++) {
1340       proc        = pa[i];
1341       sbuf1[proc] = iptr;
1342       iptr       += w1[proc];
1343     }
1344 
1345     /* Form the outgoing messages */
1346     /* Initialize the header space */
1347     for (i=0; i<nrqs; i++) {
1348       proc      = pa[i];
1349       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1350       ptr[proc] = sbuf1[proc] + 3;
1351     }
1352 
1353     /* Parse the isrow and copy data into outbuf */
1354     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1355     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1356       proc = row2proc[j];
1357       if (proc != rank) { /* copy to the outgoing buf*/
1358         *ptr[proc] = irow[j];
1359         ctr[proc]++; ptr[proc]++;
1360       }
1361     }
1362 
1363     /* Update the headers for the current IS */
1364     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1365       if ((ctr_j = ctr[j])) {
1366         sbuf1_j        = sbuf1[j];
1367         k              = ++sbuf1_j[0];
1368         sbuf1_j[2*k]   = ctr_j;
1369         sbuf1_j[2*k-1] = 0;
1370       }
1371     }
1372 
1373     /* Now post the sends */
1374     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1375     for (i=0; i<nrqs; ++i) {
1376       proc = pa[i];
1377       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1378     }
1379 
1380     /* Post Receives to capture the buffer size */
1381     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1382     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1383 
1384     rbuf2[0] = tmp + msz;
1385     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1386 
1387     for (i=0; i<nrqs; ++i) {
1388       proc = pa[i];
1389       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1390     }
1391 
1392     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1393 
1394     /* Send to other procs the buf size they should allocate */
1395     /* Receive messages*/
1396     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1397     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1398 
1399     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1400     for (i=0; i<nrqr; ++i) {
1401       req_size[i] = 0;
1402       rbuf1_i        = rbuf1[i];
1403       start          = 2*rbuf1_i[0] + 1;
1404       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1405       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1406       sbuf2_i        = sbuf2[i];
1407       for (j=start; j<end; j++) {
1408         k            = rbuf1_i[j] - rstart;
1409         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1410         sbuf2_i[j]   = ncols;
1411         req_size[i] += ncols;
1412       }
1413       req_source1[i] = r_status1[i].MPI_SOURCE;
1414 
1415       /* form the header */
1416       sbuf2_i[0] = req_size[i];
1417       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1418 
1419       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1420     }
1421 
1422     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1423     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1424 
1425     /* rbuf2 is received, Post recv column indices a->j */
1426     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1427 
1428     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1429     for (i=0; i<nrqs; ++i) {
1430       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1431       req_source2[i] = r_status2[i].MPI_SOURCE;
1432       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1433     }
1434 
1435     /* Wait on sends1 and sends2 */
1436     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1437     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1438     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1439     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1440 
1441     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1442     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1443 
1444     /* Now allocate sending buffers for a->j, and send them off */
1445     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1446     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1447     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1448     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1449 
1450     for (i=0; i<nrqr; i++) { /* for each requested message */
1451       rbuf1_i   = rbuf1[i];
1452       sbuf_aj_i = sbuf_aj[i];
1453       ct1       = 2*rbuf1_i[0] + 1;
1454       ct2       = 0;
1455       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1456 
1457       kmax = rbuf1[i][2];
1458       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1459         row    = rbuf1_i[ct1] - rstart;
1460         nzA    = ai[row+1] - ai[row];
1461         nzB    = bi[row+1] - bi[row];
1462         ncols  = nzA + nzB;
1463         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1464 
1465         /* load the column indices for this row into cols*/
1466         cols = sbuf_aj_i + ct2;
1467 
1468         lwrite = 0;
1469         for (l=0; l<nzB; l++) {
1470           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1471         }
1472         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1473         for (l=0; l<nzB; l++) {
1474           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1475         }
1476 
1477         ct2 += ncols;
1478       }
1479       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1480     }
1481 
1482     /* create column map (cmap): global col of C -> local col of submat */
1483 #if defined(PETSC_USE_CTABLE)
1484     if (!allcolumns) {
1485       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1486       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1487       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1488         if (icol[j] >= cstart && icol[j] <cend) {
1489           cmap_loc[icol[j] - cstart] = j+1;
1490         } else { /* use PetscTable for non-local col indices */
1491           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1492         }
1493       }
1494     } else {
1495       cmap     = NULL;
1496       cmap_loc = NULL;
1497     }
1498     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1499 #else
1500     if (!allcolumns) {
1501       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1502       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1503     } else {
1504       cmap = NULL;
1505     }
1506 #endif
1507 
1508     /* Create lens for MatSeqAIJSetPreallocation() */
1509     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1510 
1511     /* Compute lens from local part of C */
1512     for (j=0; j<nrow; j++) {
1513       row  = irow[j];
1514       proc = row2proc[j];
1515       if (proc == rank) {
1516         /* diagonal part A = c->A */
1517         ncols = ai[row-rstart+1] - ai[row-rstart];
1518         cols  = aj + ai[row-rstart];
1519         if (!allcolumns) {
1520           for (k=0; k<ncols; k++) {
1521 #if defined(PETSC_USE_CTABLE)
1522             tcol = cmap_loc[cols[k]];
1523 #else
1524             tcol = cmap[cols[k]+cstart];
1525 #endif
1526             if (tcol) lens[j]++;
1527           }
1528         } else { /* allcolumns */
1529           lens[j] = ncols;
1530         }
1531 
1532         /* off-diagonal part B = c->B */
1533         ncols = bi[row-rstart+1] - bi[row-rstart];
1534         cols  = bj + bi[row-rstart];
1535         if (!allcolumns) {
1536           for (k=0; k<ncols; k++) {
1537 #if defined(PETSC_USE_CTABLE)
1538             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1539 #else
1540             tcol = cmap[bmap[cols[k]]];
1541 #endif
1542             if (tcol) lens[j]++;
1543           }
1544         } else { /* allcolumns */
1545           lens[j] += ncols;
1546         }
1547       }
1548     }
1549 
1550     /* Create row map (rmap): global row of C -> local row of submat */
1551 #if defined(PETSC_USE_CTABLE)
1552     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1553     for (j=0; j<nrow; j++) {
1554       row  = irow[j];
1555       proc = row2proc[j];
1556       if (proc == rank) { /* a local row */
1557         rmap_loc[row - rstart] = j;
1558       } else {
1559         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1560       }
1561     }
1562 #else
1563     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1564     for (j=0; j<nrow; j++) {
1565       rmap[irow[j]] = j;
1566     }
1567 #endif
1568 
1569     /* Update lens from offproc data */
1570     /* recv a->j is done */
1571     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1572     for (i=0; i<nrqs; i++) {
1573       proc    = pa[i];
1574       sbuf1_i = sbuf1[proc];
1575       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1576       ct1     = 2 + 1;
1577       ct2     = 0;
1578       rbuf2_i = rbuf2[i]; /* received length of C->j */
1579       rbuf3_i = rbuf3[i]; /* received C->j */
1580 
1581       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1582       max1   = sbuf1_i[2];
1583       for (k=0; k<max1; k++,ct1++) {
1584 #if defined(PETSC_USE_CTABLE)
1585         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1586         row--;
1587         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1588 #else
1589         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1590 #endif
1591         /* Now, store row index of submat in sbuf1_i[ct1] */
1592         sbuf1_i[ct1] = row;
1593 
1594         nnz = rbuf2_i[ct1];
1595         if (!allcolumns) {
1596           for (l=0; l<nnz; l++,ct2++) {
1597 #if defined(PETSC_USE_CTABLE)
1598             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1599               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1600             } else {
1601               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1602             }
1603 #else
1604             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1605 #endif
1606             if (tcol) lens[row]++;
1607           }
1608         } else { /* allcolumns */
1609           lens[row] += nnz;
1610         }
1611       }
1612     }
1613     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1614     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1615 
1616     /* Create the submatrices */
1617     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1618     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1619 
1620     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1621     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1622     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1623     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1624     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1625 
1626     /* create struct Mat_SubSppt and attach it to submat */
1627     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1628     subc = (Mat_SeqAIJ*)submat->data;
1629     subc->submatis1 = smatis1;
1630 
1631     smatis1->id          = 0;
1632     smatis1->nrqs        = nrqs;
1633     smatis1->nrqr        = nrqr;
1634     smatis1->rbuf1       = rbuf1;
1635     smatis1->rbuf2       = rbuf2;
1636     smatis1->rbuf3       = rbuf3;
1637     smatis1->sbuf2       = sbuf2;
1638     smatis1->req_source2 = req_source2;
1639 
1640     smatis1->sbuf1       = sbuf1;
1641     smatis1->ptr         = ptr;
1642     smatis1->tmp         = tmp;
1643     smatis1->ctr         = ctr;
1644 
1645     smatis1->pa           = pa;
1646     smatis1->req_size     = req_size;
1647     smatis1->req_source1  = req_source1;
1648 
1649     smatis1->allcolumns  = allcolumns;
1650     smatis1->singleis    = PETSC_TRUE;
1651     smatis1->row2proc    = row2proc;
1652     smatis1->rmap        = rmap;
1653     smatis1->cmap        = cmap;
1654 #if defined(PETSC_USE_CTABLE)
1655     smatis1->rmap_loc    = rmap_loc;
1656     smatis1->cmap_loc    = cmap_loc;
1657 #endif
1658 
1659     smatis1->destroy     = submat->ops->destroy;
1660     submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
1661     submat->factortype   = C->factortype;
1662 
1663     /* compute rmax */
1664     rmax = 0;
1665     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1666 
1667   } else { /* scall == MAT_REUSE_MATRIX */
1668     submat = submats[0];
1669     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1670 
1671     subc    = (Mat_SeqAIJ*)submat->data;
1672     rmax    = subc->rmax;
1673     smatis1 = subc->submatis1;
1674     nrqs        = smatis1->nrqs;
1675     nrqr        = smatis1->nrqr;
1676     rbuf1       = smatis1->rbuf1;
1677     rbuf2       = smatis1->rbuf2;
1678     rbuf3       = smatis1->rbuf3;
1679     req_source2 = smatis1->req_source2;
1680 
1681     sbuf1     = smatis1->sbuf1;
1682     sbuf2     = smatis1->sbuf2;
1683     ptr       = smatis1->ptr;
1684     tmp       = smatis1->tmp;
1685     ctr       = smatis1->ctr;
1686 
1687     pa         = smatis1->pa;
1688     req_size   = smatis1->req_size;
1689     req_source1 = smatis1->req_source1;
1690 
1691     allcolumns = smatis1->allcolumns;
1692     row2proc   = smatis1->row2proc;
1693     rmap       = smatis1->rmap;
1694     cmap       = smatis1->cmap;
1695 #if defined(PETSC_USE_CTABLE)
1696     rmap_loc   = smatis1->rmap_loc;
1697     cmap_loc   = smatis1->cmap_loc;
1698 #endif
1699   }
1700 
1701   /* Post recv matrix values */
1702   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1703   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1704   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1705   for (i=0; i<nrqs; ++i) {
1706     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1707     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1708   }
1709 
1710   /* Allocate sending buffers for a->a, and send them off */
1711   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1712   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1713   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1714   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1715 
1716   for (i=0; i<nrqr; i++) {
1717     rbuf1_i   = rbuf1[i];
1718     sbuf_aa_i = sbuf_aa[i];
1719     ct1       = 2*rbuf1_i[0]+1;
1720     ct2       = 0;
1721     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1722 
1723     kmax = rbuf1_i[2];
1724     for (k=0; k<kmax; k++,ct1++) {
1725       row = rbuf1_i[ct1] - rstart;
1726       nzA = ai[row+1] - ai[row];
1727       nzB = bi[row+1] - bi[row];
1728       ncols  = nzA + nzB;
1729       cworkB = bj + bi[row];
1730       vworkA = a_a + ai[row];
1731       vworkB = b_a + bi[row];
1732 
1733       /* load the column values for this row into vals*/
1734       vals = sbuf_aa_i + ct2;
1735 
1736       lwrite = 0;
1737       for (l=0; l<nzB; l++) {
1738         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1739       }
1740       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1741       for (l=0; l<nzB; l++) {
1742         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1743       }
1744 
1745       ct2 += ncols;
1746     }
1747     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1748   }
1749 
1750   /* Assemble submat */
1751   /* First assemble the local rows */
1752   for (j=0; j<nrow; j++) {
1753     row  = irow[j];
1754     proc = row2proc[j];
1755     if (proc == rank) {
1756       Crow = row - rstart;  /* local row index of C */
1757 #if defined(PETSC_USE_CTABLE)
1758       row = rmap_loc[Crow]; /* row index of submat */
1759 #else
1760       row = rmap[row];
1761 #endif
1762 
1763       if (allcolumns) {
1764         /* diagonal part A = c->A */
1765         ncols = ai[Crow+1] - ai[Crow];
1766         cols  = aj + ai[Crow];
1767         vals  = a->a + ai[Crow];
1768         i     = 0;
1769         for (k=0; k<ncols; k++) {
1770           subcols[i]   = cols[k] + cstart;
1771           subvals[i++] = vals[k];
1772         }
1773 
1774         /* off-diagonal part B = c->B */
1775         ncols = bi[Crow+1] - bi[Crow];
1776         cols  = bj + bi[Crow];
1777         vals  = b->a + bi[Crow];
1778         for (k=0; k<ncols; k++) {
1779           subcols[i]   = bmap[cols[k]];
1780           subvals[i++] = vals[k];
1781         }
1782 
1783         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1784 
1785       } else { /* !allcolumns */
1786 #if defined(PETSC_USE_CTABLE)
1787         /* diagonal part A = c->A */
1788         ncols = ai[Crow+1] - ai[Crow];
1789         cols  = aj + ai[Crow];
1790         vals  = a->a + ai[Crow];
1791         i     = 0;
1792         for (k=0; k<ncols; k++) {
1793           tcol = cmap_loc[cols[k]];
1794           if (tcol) {
1795             subcols[i]   = --tcol;
1796             subvals[i++] = vals[k];
1797           }
1798         }
1799 
1800         /* off-diagonal part B = c->B */
1801         ncols = bi[Crow+1] - bi[Crow];
1802         cols  = bj + bi[Crow];
1803         vals  = b->a + bi[Crow];
1804         for (k=0; k<ncols; k++) {
1805           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1806           if (tcol) {
1807             subcols[i]   = --tcol;
1808             subvals[i++] = vals[k];
1809           }
1810         }
1811 #else
1812         /* diagonal part A = c->A */
1813         ncols = ai[Crow+1] - ai[Crow];
1814         cols  = aj + ai[Crow];
1815         vals  = a->a + ai[Crow];
1816         i     = 0;
1817         for (k=0; k<ncols; k++) {
1818           tcol = cmap[cols[k]+cstart];
1819           if (tcol) {
1820             subcols[i]   = --tcol;
1821             subvals[i++] = vals[k];
1822           }
1823         }
1824 
1825         /* off-diagonal part B = c->B */
1826         ncols = bi[Crow+1] - bi[Crow];
1827         cols  = bj + bi[Crow];
1828         vals  = b->a + bi[Crow];
1829         for (k=0; k<ncols; k++) {
1830           tcol = cmap[bmap[cols[k]]];
1831           if (tcol) {
1832             subcols[i]   = --tcol;
1833             subvals[i++] = vals[k];
1834           }
1835         }
1836 #endif
1837         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1838       }
1839     }
1840   }
1841 
1842   /* Now assemble the off-proc rows */
1843   for (i=0; i<nrqs; i++) { /* for each requested message */
1844     /* recv values from other processes */
1845     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1846     proc    = pa[idex];
1847     sbuf1_i = sbuf1[proc];
1848     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1849     ct1     = 2 + 1;
1850     ct2     = 0; /* count of received C->j */
1851     ct3     = 0; /* count of received C->j that will be inserted into submat */
1852     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1853     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1854     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1855 
1856     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1857     max1 = sbuf1_i[2];             /* num of rows */
1858     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1859       row = sbuf1_i[ct1]; /* row index of submat */
1860       if (!allcolumns) {
1861         idex = 0;
1862         if (scall == MAT_INITIAL_MATRIX) {
1863           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1864           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1865 #if defined(PETSC_USE_CTABLE)
1866             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1867               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1868             } else {
1869               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1870             }
1871 #else
1872             tcol = cmap[rbuf3_i[ct2]];
1873 #endif
1874             if (tcol) {
1875               subcols[idex]   = --tcol;
1876               subvals[idex++] = rbuf4_i[ct2];
1877 
1878               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1879                For reuse, we replace received C->j with index that should be inserted to submat */
1880               rbuf3_i[ct3++] = ct2;
1881             }
1882           }
1883           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1884 
1885         } else { /* scall == MAT_REUSE_MATRIX */
1886           submat = submats[0];
1887           subc   = (Mat_SeqAIJ*)submat->data;
1888 
1889           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1890           for (l=0; l<nnz; l++) {
1891             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1892             subvals[idex++] = rbuf4_i[ct2];
1893           }
1894 
1895           bj = subc->j + subc->i[row];
1896           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1897         }
1898       } else { /* allcolumns */
1899         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1900         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1901         ct2 += nnz;
1902       }
1903     }
1904   }
1905 
1906   /* sending a->a are done */
1907   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1908   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1909 
1910   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1911   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1912   submats[0] = submat;
1913 
1914   /* Restore the indices */
1915   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1916   if (!allcolumns) {
1917     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1918   }
1919 
1920   /* Destroy allocated memory */
1921   for (i=0; i<nrqs; ++i) {
1922     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1923   }
1924   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1925   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1926   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1927 
1928   if (scall == MAT_INITIAL_MATRIX) {
1929     ierr = PetscFree(lens);CHKERRQ(ierr);
1930     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1931     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1932   }
1933   PetscFunctionReturn(0);
1934 }
1935 
1936 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1937 {
1938   PetscErrorCode ierr;
1939   PetscInt       ncol;
1940   PetscBool      colflag,allcolumns=PETSC_FALSE;
1941 
1942   PetscFunctionBegin;
1943   /* Allocate memory to hold all the submatrices */
1944   if (scall == MAT_INITIAL_MATRIX) {
1945     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1946   }
1947 
1948   /* Check for special case: each processor gets entire matrix columns */
1949   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1950   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1951   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1952 
1953   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1954   PetscFunctionReturn(0);
1955 }
1956 
/*
  MatCreateSubMatrices_MPIAIJ - Extract ismax sequential submatrices C(isrow[i],iscol[i])
  from the parallel matrix C.

  Submatrices are extracted in stages of at most nmax index sets each; every rank must
  execute the same global number of stages (each stage communicates), so nstages is
  agreed upon with an allreduce below. Two fast paths exist:
    - C->submat_singleis (set by PCSetUp_ASM()) dispatches to the single-IS routine
      and skips the allreduce entirely;
    - if every rank requests the identity over the whole (square) matrix, the request
      is served by MatCreateSubMatrix_MPIAIJ_All().
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = submatrices handled per stage; presumably a ~20MB PetscInt workspace budget
     divided by the column dimension -- TODO confirm intent of the 20*1000000 constant */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single IS pair on a square matrix: test whether it selects the whole matrix */
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Encode wantallmatrix TRUE as -1 so MPI_MAX acts as a logical AND across ranks:
       any rank contributing 0 (FALSE) forces out[0] = 0 */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]); /* TRUE only if every rank wanted the whole matrix */
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the support struct saved during the MAT_INITIAL_MATRIX call */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;      /* a full stage of nmax submatrices */
    else if (pos == ismax) max_no = 0;         /* local work done; still participate (dummy) */
    else                   max_no = ismax-pos; /* final partial stage */

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2054 
2055 /* -------------------------------------------------------------------------*/
2056 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2057 {
2058   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2059   Mat            A  = c->A;
2060   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2061   const PetscInt **icol,**irow;
2062   PetscInt       *nrow,*ncol,start;
2063   PetscErrorCode ierr;
2064   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2065   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2066   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2067   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2068   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2069 #if defined(PETSC_USE_CTABLE)
2070   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2071 #else
2072   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2073 #endif
2074   const PetscInt *irow_i;
2075   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2076   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2077   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2078   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2079   MPI_Status     *r_status3,*r_status4,*s_status4;
2080   MPI_Comm       comm;
2081   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2082   PetscMPIInt    *onodes1,*olengths1,end;
2083   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2084   Mat_SubSppt    *smat_i;
2085   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2086   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2087 
2088   PetscFunctionBegin;
2089   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2090   size = c->size;
2091   rank = c->rank;
2092 
2093   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2094   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2095 
2096   for (i=0; i<ismax; i++) {
2097     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2098     if (!issorted[i]) iscsorted = issorted[i];
2099 
2100     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2101 
2102     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2103     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2104 
2105     /* Check for special case: allcolumn */
2106     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2107     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2108     if (colflag && ncol[i] == C->cmap->N) {
2109       allcolumns[i] = PETSC_TRUE;
2110       icol[i] = NULL;
2111     } else {
2112       allcolumns[i] = PETSC_FALSE;
2113       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2114     }
2115   }
2116 
2117   if (scall == MAT_REUSE_MATRIX) {
2118     /* Assumes new rows are same length as the old rows */
2119     for (i=0; i<ismax; i++) {
2120       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2121       subc = (Mat_SeqAIJ*)submats[i]->data;
2122       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2123 
2124       /* Initial matrix as if empty */
2125       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2126 
2127       smat_i   = subc->submatis1;
2128 
2129       nrqs        = smat_i->nrqs;
2130       nrqr        = smat_i->nrqr;
2131       rbuf1       = smat_i->rbuf1;
2132       rbuf2       = smat_i->rbuf2;
2133       rbuf3       = smat_i->rbuf3;
2134       req_source2 = smat_i->req_source2;
2135 
2136       sbuf1     = smat_i->sbuf1;
2137       sbuf2     = smat_i->sbuf2;
2138       ptr       = smat_i->ptr;
2139       tmp       = smat_i->tmp;
2140       ctr       = smat_i->ctr;
2141 
2142       pa          = smat_i->pa;
2143       req_size    = smat_i->req_size;
2144       req_source1 = smat_i->req_source1;
2145 
2146       allcolumns[i] = smat_i->allcolumns;
2147       row2proc[i]   = smat_i->row2proc;
2148       rmap[i]       = smat_i->rmap;
2149       cmap[i]       = smat_i->cmap;
2150     }
2151 
2152     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2153       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2154       smat_i = (Mat_SubSppt*)submats[0]->data;
2155 
2156       nrqs        = smat_i->nrqs;
2157       nrqr        = smat_i->nrqr;
2158       rbuf1       = smat_i->rbuf1;
2159       rbuf2       = smat_i->rbuf2;
2160       rbuf3       = smat_i->rbuf3;
2161       req_source2 = smat_i->req_source2;
2162 
2163       sbuf1       = smat_i->sbuf1;
2164       sbuf2       = smat_i->sbuf2;
2165       ptr         = smat_i->ptr;
2166       tmp         = smat_i->tmp;
2167       ctr         = smat_i->ctr;
2168 
2169       pa          = smat_i->pa;
2170       req_size    = smat_i->req_size;
2171       req_source1 = smat_i->req_source1;
2172 
2173       allcolumns[0] = PETSC_FALSE;
2174     }
2175   } else { /* scall == MAT_INITIAL_MATRIX */
2176     /* Get some new tags to keep the communication clean */
2177     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2178     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2179 
2180     /* evaluate communication - mesg to who, length of mesg, and buffer space
2181      required. Based on this, buffers are allocated, and data copied into them*/
2182     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2183 
2184     for (i=0; i<ismax; i++) {
2185       jmax   = nrow[i];
2186       irow_i = irow[i];
2187 
2188       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2189       row2proc[i] = row2proc_i;
2190 
2191       if (issorted[i]) proc = 0;
2192       for (j=0; j<jmax; j++) {
2193         if (!issorted[i]) proc = 0;
2194         row = irow_i[j];
2195         while (row >= C->rmap->range[proc+1]) proc++;
2196         w4[proc]++;
2197         row2proc_i[j] = proc; /* map row index to proc */
2198       }
2199       for (j=0; j<size; j++) {
2200         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2201       }
2202     }
2203 
2204     nrqs     = 0;              /* no of outgoing messages */
2205     msz      = 0;              /* total mesg length (for all procs) */
2206     w1[rank] = 0;              /* no mesg sent to self */
2207     w3[rank] = 0;
2208     for (i=0; i<size; i++) {
2209       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2210     }
2211     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2212     for (i=0,j=0; i<size; i++) {
2213       if (w1[i]) { pa[j] = i; j++; }
2214     }
2215 
2216     /* Each message would have a header = 1 + 2*(no of IS) + data */
2217     for (i=0; i<nrqs; i++) {
2218       j      = pa[i];
2219       w1[j] += w2[j] + 2* w3[j];
2220       msz   += w1[j];
2221     }
2222     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2223 
2224     /* Determine the number of messages to expect, their lengths, and their from-ids */
2225     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2226     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2227 
2228     /* Now post the Irecvs corresponding to these messages */
2229     tag0 = ((PetscObject)C)->tag;
2230     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2231 
2232     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2233     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2234 
2235     /* Allocate Memory for outgoing messages */
2236     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2237     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2238     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2239 
2240     {
2241       PetscInt *iptr = tmp;
2242       k    = 0;
2243       for (i=0; i<nrqs; i++) {
2244         j        = pa[i];
2245         iptr    += k;
2246         sbuf1[j] = iptr;
2247         k        = w1[j];
2248       }
2249     }
2250 
2251     /* Form the outgoing messages. Initialize the header space */
2252     for (i=0; i<nrqs; i++) {
2253       j           = pa[i];
2254       sbuf1[j][0] = 0;
2255       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2256       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2257     }
2258 
2259     /* Parse the isrow and copy data into outbuf */
2260     for (i=0; i<ismax; i++) {
2261       row2proc_i = row2proc[i];
2262       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2263       irow_i = irow[i];
2264       jmax   = nrow[i];
2265       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2266         proc = row2proc_i[j];
2267         if (proc != rank) { /* copy to the outgoing buf*/
2268           ctr[proc]++;
2269           *ptr[proc] = irow_i[j];
2270           ptr[proc]++;
2271         }
2272       }
2273       /* Update the headers for the current IS */
2274       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2275         if ((ctr_j = ctr[j])) {
2276           sbuf1_j        = sbuf1[j];
2277           k              = ++sbuf1_j[0];
2278           sbuf1_j[2*k]   = ctr_j;
2279           sbuf1_j[2*k-1] = i;
2280         }
2281       }
2282     }
2283 
2284     /*  Now  post the sends */
2285     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2286     for (i=0; i<nrqs; ++i) {
2287       j    = pa[i];
2288       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2289     }
2290 
2291     /* Post Receives to capture the buffer size */
2292     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2293     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2294     rbuf2[0] = tmp + msz;
2295     for (i=1; i<nrqs; ++i) {
2296       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2297     }
2298     for (i=0; i<nrqs; ++i) {
2299       j    = pa[i];
2300       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2301     }
2302 
2303     /* Send to other procs the buf size they should allocate */
2304     /* Receive messages*/
2305     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2306     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2307     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2308     {
2309       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2310       PetscInt   *sbuf2_i;
2311 
2312       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2313       for (i=0; i<nrqr; ++i) {
2314         req_size[i] = 0;
2315         rbuf1_i        = rbuf1[i];
2316         start          = 2*rbuf1_i[0] + 1;
2317         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2318         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2319         sbuf2_i        = sbuf2[i];
2320         for (j=start; j<end; j++) {
2321           id              = rbuf1_i[j] - rstart;
2322           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2323           sbuf2_i[j]      = ncols;
2324           req_size[i] += ncols;
2325         }
2326         req_source1[i] = r_status1[i].MPI_SOURCE;
2327         /* form the header */
2328         sbuf2_i[0] = req_size[i];
2329         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2330 
2331         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2332       }
2333     }
2334     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2335     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2336     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2337 
2338     /* Receive messages*/
2339     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2340     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2341 
2342     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2343     for (i=0; i<nrqs; ++i) {
2344       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2345       req_source2[i] = r_status2[i].MPI_SOURCE;
2346       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2347     }
2348     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2349     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2350 
2351     /* Wait on sends1 and sends2 */
2352     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2353     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2354 
2355     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2356     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2357     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2358     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2359     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2360     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2361 
2362     /* Now allocate sending buffers for a->j, and send them off */
2363     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2364     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2365     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2366     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2367 
2368     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2369     {
2370       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2371       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2372       PetscInt cend = C->cmap->rend;
2373       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2374 
2375       for (i=0; i<nrqr; i++) {
2376         rbuf1_i   = rbuf1[i];
2377         sbuf_aj_i = sbuf_aj[i];
2378         ct1       = 2*rbuf1_i[0] + 1;
2379         ct2       = 0;
2380         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2381           kmax = rbuf1[i][2*j];
2382           for (k=0; k<kmax; k++,ct1++) {
2383             row    = rbuf1_i[ct1] - rstart;
2384             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2385             ncols  = nzA + nzB;
2386             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2387 
2388             /* load the column indices for this row into cols */
2389             cols = sbuf_aj_i + ct2;
2390 
2391             lwrite = 0;
2392             for (l=0; l<nzB; l++) {
2393               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2394             }
2395             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2396             for (l=0; l<nzB; l++) {
2397               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2398             }
2399 
2400             ct2 += ncols;
2401           }
2402         }
2403         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2404       }
2405     }
2406     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2407 
2408     /* create col map: global col of C -> local col of submatrices */
2409     {
2410       const PetscInt *icol_i;
2411 #if defined(PETSC_USE_CTABLE)
2412       for (i=0; i<ismax; i++) {
2413         if (!allcolumns[i]) {
2414           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2415 
2416           jmax   = ncol[i];
2417           icol_i = icol[i];
2418           cmap_i = cmap[i];
2419           for (j=0; j<jmax; j++) {
2420             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2421           }
2422         } else cmap[i] = NULL;
2423       }
2424 #else
2425       for (i=0; i<ismax; i++) {
2426         if (!allcolumns[i]) {
2427           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2428           jmax   = ncol[i];
2429           icol_i = icol[i];
2430           cmap_i = cmap[i];
2431           for (j=0; j<jmax; j++) {
2432             cmap_i[icol_i[j]] = j+1;
2433           }
2434         } else cmap[i] = NULL;
2435       }
2436 #endif
2437     }
2438 
2439     /* Create lens which is required for MatCreate... */
2440     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2441     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2442 
2443     if (ismax) {
2444       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2445     }
2446     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2447 
2448     /* Update lens from local data */
2449     for (i=0; i<ismax; i++) {
2450       row2proc_i = row2proc[i];
2451       jmax = nrow[i];
2452       if (!allcolumns[i]) cmap_i = cmap[i];
2453       irow_i = irow[i];
2454       lens_i = lens[i];
2455       for (j=0; j<jmax; j++) {
2456         row = irow_i[j];
2457         proc = row2proc_i[j];
2458         if (proc == rank) {
2459           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2460           if (!allcolumns[i]) {
2461             for (k=0; k<ncols; k++) {
2462 #if defined(PETSC_USE_CTABLE)
2463               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2464 #else
2465               tcol = cmap_i[cols[k]];
2466 #endif
2467               if (tcol) lens_i[j]++;
2468             }
2469           } else { /* allcolumns */
2470             lens_i[j] = ncols;
2471           }
2472           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2473         }
2474       }
2475     }
2476 
2477     /* Create row map: global row of C -> local row of submatrices */
2478 #if defined(PETSC_USE_CTABLE)
2479     for (i=0; i<ismax; i++) {
2480       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2481       irow_i = irow[i];
2482       jmax   = nrow[i];
2483       for (j=0; j<jmax; j++) {
2484       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2485       }
2486     }
2487 #else
2488     for (i=0; i<ismax; i++) {
2489       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2490       rmap_i = rmap[i];
2491       irow_i = irow[i];
2492       jmax   = nrow[i];
2493       for (j=0; j<jmax; j++) {
2494         rmap_i[irow_i[j]] = j;
2495       }
2496     }
2497 #endif
2498 
2499     /* Update lens from offproc data */
2500     {
2501       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2502 
2503       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2504       for (tmp2=0; tmp2<nrqs; tmp2++) {
2505         sbuf1_i = sbuf1[pa[tmp2]];
2506         jmax    = sbuf1_i[0];
2507         ct1     = 2*jmax+1;
2508         ct2     = 0;
2509         rbuf2_i = rbuf2[tmp2];
2510         rbuf3_i = rbuf3[tmp2];
2511         for (j=1; j<=jmax; j++) {
2512           is_no  = sbuf1_i[2*j-1];
2513           max1   = sbuf1_i[2*j];
2514           lens_i = lens[is_no];
2515           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2516           rmap_i = rmap[is_no];
2517           for (k=0; k<max1; k++,ct1++) {
2518 #if defined(PETSC_USE_CTABLE)
2519             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2520             row--;
2521             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2522 #else
2523             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2524 #endif
2525             max2 = rbuf2_i[ct1];
2526             for (l=0; l<max2; l++,ct2++) {
2527               if (!allcolumns[is_no]) {
2528 #if defined(PETSC_USE_CTABLE)
2529                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2530 #else
2531                 tcol = cmap_i[rbuf3_i[ct2]];
2532 #endif
2533                 if (tcol) lens_i[row]++;
2534               } else { /* allcolumns */
2535                 lens_i[row]++; /* lens_i[row] += max2 ? */
2536               }
2537             }
2538           }
2539         }
2540       }
2541     }
2542     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2543     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2544     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2545     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2546 
2547     /* Create the submatrices */
2548     for (i=0; i<ismax; i++) {
2549       PetscInt    rbs,cbs;
2550 
2551       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2552       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2553 
2554       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2555       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2556 
2557       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2558       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2559       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2560 
2561       /* create struct Mat_SubSppt and attached it to submat */
2562       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2563       subc = (Mat_SeqAIJ*)submats[i]->data;
2564       subc->submatis1 = smat_i;
2565 
2566       smat_i->destroy          = submats[i]->ops->destroy;
2567       submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
2568       submats[i]->factortype   = C->factortype;
2569 
2570       smat_i->id          = i;
2571       smat_i->nrqs        = nrqs;
2572       smat_i->nrqr        = nrqr;
2573       smat_i->rbuf1       = rbuf1;
2574       smat_i->rbuf2       = rbuf2;
2575       smat_i->rbuf3       = rbuf3;
2576       smat_i->sbuf2       = sbuf2;
2577       smat_i->req_source2 = req_source2;
2578 
2579       smat_i->sbuf1       = sbuf1;
2580       smat_i->ptr         = ptr;
2581       smat_i->tmp         = tmp;
2582       smat_i->ctr         = ctr;
2583 
2584       smat_i->pa           = pa;
2585       smat_i->req_size     = req_size;
2586       smat_i->req_source1  = req_source1;
2587 
2588       smat_i->allcolumns  = allcolumns[i];
2589       smat_i->singleis    = PETSC_FALSE;
2590       smat_i->row2proc    = row2proc[i];
2591       smat_i->rmap        = rmap[i];
2592       smat_i->cmap        = cmap[i];
2593     }
2594 
2595     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2596       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2597       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2598       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2599 
2600       /* create struct Mat_SubSppt and attached it to submat */
2601       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2602       submats[0]->data = (void*)smat_i;
2603 
2604       smat_i->destroy          = submats[0]->ops->destroy;
2605       submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
2606       submats[0]->factortype   = C->factortype;
2607 
2608       smat_i->id          = 0;
2609       smat_i->nrqs        = nrqs;
2610       smat_i->nrqr        = nrqr;
2611       smat_i->rbuf1       = rbuf1;
2612       smat_i->rbuf2       = rbuf2;
2613       smat_i->rbuf3       = rbuf3;
2614       smat_i->sbuf2       = sbuf2;
2615       smat_i->req_source2 = req_source2;
2616 
2617       smat_i->sbuf1       = sbuf1;
2618       smat_i->ptr         = ptr;
2619       smat_i->tmp         = tmp;
2620       smat_i->ctr         = ctr;
2621 
2622       smat_i->pa           = pa;
2623       smat_i->req_size     = req_size;
2624       smat_i->req_source1  = req_source1;
2625 
2626       smat_i->allcolumns  = PETSC_FALSE;
2627       smat_i->singleis    = PETSC_FALSE;
2628       smat_i->row2proc    = NULL;
2629       smat_i->rmap        = NULL;
2630       smat_i->cmap        = NULL;
2631     }
2632 
2633     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2634     ierr = PetscFree(lens);CHKERRQ(ierr);
2635     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2636     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2637 
2638   } /* endof scall == MAT_INITIAL_MATRIX */
2639 
2640   /* Post recv matrix values */
2641   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2642   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2643   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2644   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2645   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2646   for (i=0; i<nrqs; ++i) {
2647     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2648     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2649   }
2650 
2651   /* Allocate sending buffers for a->a, and send them off */
2652   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2653   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2654   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2655   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2656 
2657   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2658   {
2659     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2660     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2661     PetscInt    cend   = C->cmap->rend;
2662     PetscInt    *b_j   = b->j;
2663     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2664 
2665     for (i=0; i<nrqr; i++) {
2666       rbuf1_i   = rbuf1[i];
2667       sbuf_aa_i = sbuf_aa[i];
2668       ct1       = 2*rbuf1_i[0]+1;
2669       ct2       = 0;
2670       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2671         kmax = rbuf1_i[2*j];
2672         for (k=0; k<kmax; k++,ct1++) {
2673           row    = rbuf1_i[ct1] - rstart;
2674           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2675           ncols  = nzA + nzB;
2676           cworkB = b_j + b_i[row];
2677           vworkA = a_a + a_i[row];
2678           vworkB = b_a + b_i[row];
2679 
2680           /* load the column values for this row into vals*/
2681           vals = sbuf_aa_i+ct2;
2682 
2683           lwrite = 0;
2684           for (l=0; l<nzB; l++) {
2685             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2686           }
2687           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2688           for (l=0; l<nzB; l++) {
2689             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2690           }
2691 
2692           ct2 += ncols;
2693         }
2694       }
2695       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2696     }
2697   }
2698 
2699   /* Assemble the matrices */
2700   /* First assemble the local rows */
2701   for (i=0; i<ismax; i++) {
2702     row2proc_i = row2proc[i];
2703     subc      = (Mat_SeqAIJ*)submats[i]->data;
2704     imat_ilen = subc->ilen;
2705     imat_j    = subc->j;
2706     imat_i    = subc->i;
2707     imat_a    = subc->a;
2708 
2709     if (!allcolumns[i]) cmap_i = cmap[i];
2710     rmap_i = rmap[i];
2711     irow_i = irow[i];
2712     jmax   = nrow[i];
2713     for (j=0; j<jmax; j++) {
2714       row  = irow_i[j];
2715       proc = row2proc_i[j];
2716       if (proc == rank) {
2717         old_row = row;
2718 #if defined(PETSC_USE_CTABLE)
2719         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2720         row--;
2721 #else
2722         row = rmap_i[row];
2723 #endif
2724         ilen_row = imat_ilen[row];
2725         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2726         mat_i    = imat_i[row];
2727         mat_a    = imat_a + mat_i;
2728         mat_j    = imat_j + mat_i;
2729         if (!allcolumns[i]) {
2730           for (k=0; k<ncols; k++) {
2731 #if defined(PETSC_USE_CTABLE)
2732             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2733 #else
2734             tcol = cmap_i[cols[k]];
2735 #endif
2736             if (tcol) {
2737               *mat_j++ = tcol - 1;
2738               *mat_a++ = vals[k];
2739               ilen_row++;
2740             }
2741           }
2742         } else { /* allcolumns */
2743           for (k=0; k<ncols; k++) {
2744             *mat_j++ = cols[k];  /* global col index! */
2745             *mat_a++ = vals[k];
2746             ilen_row++;
2747           }
2748         }
2749         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2750 
2751         imat_ilen[row] = ilen_row;
2752       }
2753     }
2754   }
2755 
2756   /* Now assemble the off proc rows */
2757   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2758   for (tmp2=0; tmp2<nrqs; tmp2++) {
2759     sbuf1_i = sbuf1[pa[tmp2]];
2760     jmax    = sbuf1_i[0];
2761     ct1     = 2*jmax + 1;
2762     ct2     = 0;
2763     rbuf2_i = rbuf2[tmp2];
2764     rbuf3_i = rbuf3[tmp2];
2765     rbuf4_i = rbuf4[tmp2];
2766     for (j=1; j<=jmax; j++) {
2767       is_no     = sbuf1_i[2*j-1];
2768       rmap_i    = rmap[is_no];
2769       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2770       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2771       imat_ilen = subc->ilen;
2772       imat_j    = subc->j;
2773       imat_i    = subc->i;
2774       imat_a    = subc->a;
2775       max1      = sbuf1_i[2*j];
2776       for (k=0; k<max1; k++,ct1++) {
2777         row = sbuf1_i[ct1];
2778 #if defined(PETSC_USE_CTABLE)
2779         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2780         row--;
2781 #else
2782         row = rmap_i[row];
2783 #endif
2784         ilen  = imat_ilen[row];
2785         mat_i = imat_i[row];
2786         mat_a = imat_a + mat_i;
2787         mat_j = imat_j + mat_i;
2788         max2  = rbuf2_i[ct1];
2789         if (!allcolumns[is_no]) {
2790           for (l=0; l<max2; l++,ct2++) {
2791 #if defined(PETSC_USE_CTABLE)
2792             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2793 #else
2794             tcol = cmap_i[rbuf3_i[ct2]];
2795 #endif
2796             if (tcol) {
2797               *mat_j++ = tcol - 1;
2798               *mat_a++ = rbuf4_i[ct2];
2799               ilen++;
2800             }
2801           }
2802         } else { /* allcolumns */
2803           for (l=0; l<max2; l++,ct2++) {
2804             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2805             *mat_a++ = rbuf4_i[ct2];
2806             ilen++;
2807           }
2808         }
2809         imat_ilen[row] = ilen;
2810       }
2811     }
2812   }
2813 
2814   if (!iscsorted) { /* sort column indices of the rows */
2815     for (i=0; i<ismax; i++) {
2816       subc      = (Mat_SeqAIJ*)submats[i]->data;
2817       imat_j    = subc->j;
2818       imat_i    = subc->i;
2819       imat_a    = subc->a;
2820       imat_ilen = subc->ilen;
2821 
2822       if (allcolumns[i]) continue;
2823       jmax = nrow[i];
2824       for (j=0; j<jmax; j++) {
2825         mat_i = imat_i[j];
2826         mat_a = imat_a + mat_i;
2827         mat_j = imat_j + mat_i;
2828         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2829       }
2830     }
2831   }
2832 
2833   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2834   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2835   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2836   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2837   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2838 
2839   /* Restore the indices */
2840   for (i=0; i<ismax; i++) {
2841     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2842     if (!allcolumns[i]) {
2843       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2844     }
2845   }
2846 
2847   for (i=0; i<ismax; i++) {
2848     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2849     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2850   }
2851 
2852   /* Destroy allocated memory */
2853   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2854   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2855   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2856 
2857   for (i=0; i<nrqs; ++i) {
2858     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2859   }
2860   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2861 
2862   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2863   PetscFunctionReturn(0);
2864 }
2865 
2866 /*
2867  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2868  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2870  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2871  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2872  state, and needs to be "assembled" later by compressing B's column space.
2873 
2874  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2875  Following this call, C->A & C->B have been created, even if empty.
2876  */
2877 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2878 {
2879   /* If making this function public, change the error returned in this function away from _PLIB. */
2880   PetscErrorCode ierr;
2881   Mat_MPIAIJ     *aij;
2882   Mat_SeqAIJ     *Baij;
2883   PetscBool      seqaij,Bdisassembled;
2884   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2885   PetscScalar    v;
2886   const PetscInt *rowindices,*colindices;
2887 
2888   PetscFunctionBegin;
2889   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2890   if (A) {
2891     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2892     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2893     if (rowemb) {
2894       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2895       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2896     } else {
2897       if (C->rmap->n != A->rmap->n) {
2898 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2899       }
2900     }
2901     if (dcolemb) {
2902       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2903       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2904     } else {
2905       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2906     }
2907   }
2908   if (B) {
2909     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2910     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2911     if (rowemb) {
2912       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2913       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2914     } else {
2915       if (C->rmap->n != B->rmap->n) {
2916 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2917       }
2918     }
2919     if (ocolemb) {
2920       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2921       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2922     } else {
2923       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2924     }
2925   }
2926 
2927   aij = (Mat_MPIAIJ*)C->data;
2928   if (!aij->A) {
2929     /* Mimic parts of MatMPIAIJSetPreallocation() */
2930     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2931     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2932     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2933     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2934     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2935   }
2936   if (A) {
2937     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2938   } else {
2939     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2940   }
2941   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2942     /*
2943       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2944       need to "disassemble" B -- convert it to using C's global indices.
2945       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2946 
2947       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2948       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2949 
2950       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2951       At least avoid calling MatSetValues() and the implied searches?
2952     */
2953 
2954     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2955 #if defined(PETSC_USE_CTABLE)
2956       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2957 #else
2958       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2959       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2960       if (aij->B) {
2961         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2962       }
2963 #endif
2964       ngcol = 0;
2965       if (aij->lvec) {
2966 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2967       }
2968       if (aij->garray) {
2969 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2970 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2971       }
2972       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2973       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2974     }
2975     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2976       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2977     }
2978     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2979       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2980     }
2981   }
2982   Bdisassembled = PETSC_FALSE;
2983   if (!aij->B) {
2984     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2985     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2986     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2987     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2988     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2989     Bdisassembled = PETSC_TRUE;
2990   }
2991   if (B) {
2992     Baij = (Mat_SeqAIJ*)B->data;
2993     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2994       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2995       for (i=0; i<B->rmap->n; i++) {
2996 	nz[i] = Baij->i[i+1] - Baij->i[i];
2997       }
2998       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2999       ierr = PetscFree(nz);CHKERRQ(ierr);
3000     }
3001 
3002     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3003     shift = rend-rstart;
3004     count = 0;
3005     rowindices = NULL;
3006     colindices = NULL;
3007     if (rowemb) {
3008       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3009     }
3010     if (ocolemb) {
3011       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3012     }
3013     for (i=0; i<B->rmap->n; i++) {
3014       PetscInt row;
3015       row = i;
3016       if (rowindices) row = rowindices[i];
3017       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3018 	col  = Baij->j[count];
3019 	if (colindices) col = colindices[col];
3020 	if (Bdisassembled && col>=rstart) col += shift;
3021 	v    = Baij->a[count];
3022 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3023 	++count;
3024       }
3025     }
3026     /* No assembly for aij->B is necessary. */
3027     /* FIXME: set aij->B's nonzerostate correctly. */
3028   } else {
3029     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3030   }
3031   C->preallocated  = PETSC_TRUE;
3032   C->was_assembled = PETSC_FALSE;
3033   C->assembled     = PETSC_FALSE;
3034    /*
3035       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3036       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3037    */
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 /*
3042   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3043  */
3044 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3045 {
3046   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3047 
3048   PetscFunctionBegin;
3049   PetscValidPointer(A,2);
3050   PetscValidPointer(B,3);
3051   /* FIXME: make sure C is assembled */
3052   *A = aij->A;
3053   *B = aij->B;
3054   /* Note that we don't incref *A and *B, so be careful! */
3055   PetscFunctionReturn(0);
3056 }
3057 
3058 /*
3059   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3060   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3061 */
3062 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3063                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3064 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3065 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3066 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3067 {
3068   PetscErrorCode ierr;
3069   PetscMPIInt    isize,flag;
3070   PetscInt       i,ii,cismax,ispar;
3071   Mat            *A,*B;
3072   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3073 
3074   PetscFunctionBegin;
3075   if (!ismax) PetscFunctionReturn(0);
3076 
3077   for (i = 0, cismax = 0; i < ismax; ++i) {
3078     PetscMPIInt isize;
3079     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3080     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3081     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3082     if (isize > 1) ++cismax;
3083   }
3084 
3085   /*
3086      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3087      ispar counts the number of parallel ISs across C's comm.
3088   */
3089   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3090   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3091     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3092     PetscFunctionReturn(0);
3093   }
3094 
3095   /* if (ispar) */
3096   /*
3097     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3098     These are used to extract the off-diag portion of the resulting parallel matrix.
3099     The row IS for the off-diag portion is the same as for the diag portion,
3100     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3101   */
3102   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3103   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3104   for (i = 0, ii = 0; i < ismax; ++i) {
3105     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3106     if (isize > 1) {
3107       /*
3108 	 TODO: This is the part that's ***NOT SCALABLE***.
3109 	 To fix this we need to extract just the indices of C's nonzero columns
3110 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3111 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3112 	 to be done without serializing on the IS list, so, most likely, it is best
3113 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3114       */
3115       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3116       /* Now we have to
3117 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3118 	     were sorted on each rank, concatenated they might no longer be sorted;
3119 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3120 	     indices in the nondecreasing order to the original index positions.
3121 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3122       */
3123       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3124       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3125       ++ii;
3126     }
3127   }
3128   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3129   for (i = 0, ii = 0; i < ismax; ++i) {
3130     PetscInt       j,issize;
3131     const PetscInt *indices;
3132 
3133     /*
3134        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3135      */
3136     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3137     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3138     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3139     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3140     for (j = 1; j < issize; ++j) {
3141       if (indices[j] == indices[j-1]) {
3142 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3143       }
3144     }
3145     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3146 
3147 
3148     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3149     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3150     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3151     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3152     for (j = 1; j < issize; ++j) {
3153       if (indices[j-1] == indices[j]) {
3154 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155       }
3156     }
3157     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3158     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3159     if (isize > 1) {
3160       cisrow[ii] = isrow[i];
3161       ++ii;
3162     }
3163   }
3164   /*
3165     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3166     array of sequential matrices underlying the resulting parallel matrices.
3167     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3168     contain duplicates.
3169 
3170     There are as many diag matrices as there are original index sets. There are only as many parallel
3171     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3172 
3173     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3174     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3175       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3176       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3177       with A[i] and B[ii] extracted from the corresponding MPI submat.
3178     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3179       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3180       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3181       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3182       values into A[i] and B[ii] sitting inside the corresponding submat.
3183     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3184       A[i], B[ii], AA[i] or BB[ii] matrices.
3185   */
3186   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3187   if (scall == MAT_INITIAL_MATRIX) {
3188     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3189   }
3190 
3191   /* Now obtain the sequential A and B submatrices separately. */
3192   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3193   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3194   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3195 
3196   /*
3197     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3198     matrices A & B have been extracted directly into the parallel matrices containing them, or
3199     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3200     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3201     to have the same sparsity pattern.
3202     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3203     must be constructed for C. This is done by setseqmat(s).
3204   */
3205   for (i = 0, ii = 0; i < ismax; ++i) {
3206     /*
3207        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3208        That way we can avoid sorting and computing permutations when reusing.
3209        To this end:
3210         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3211 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3212 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3213     */
3214     MatStructure pattern;
3215     pattern = DIFFERENT_NONZERO_PATTERN;
3216 
3217     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3218     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3219     if (isize > 1) {
3220       if (scall == MAT_INITIAL_MATRIX) {
3221 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3222 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3223 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3224 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3225 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3226       }
3227       /*
3228 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3229       */
3230       {
3231 	Mat AA,BB;
3232         AA = A[i];
3233         BB = B[ii];
3234 	if (AA || BB) {
3235 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3236 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3237 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3238 	}
3239 
3240         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3241       }
3242       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3243       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3244       ++ii;
3245     } else { /* if (isize == 1) */
3246       if (scall == MAT_REUSE_MATRIX) {
3247         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3248       }
3249       if (isrow_p[i] || iscol_p[i]) {
3250         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3251         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3252 	/* Otherwise A is extracted straight into (*submats)[i]. */
3253 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3254 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3255       } else (*submat)[i] = A[i];
3256     }
3257     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3258     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3259   }
3260   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3261   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3262   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3263   ierr = PetscFree(A);CHKERRQ(ierr);
3264   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3265   PetscFunctionReturn(0);
3266 }
3267 
3268 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3269 {
3270   PetscErrorCode ierr;
3271 
3272   PetscFunctionBegin;
3273   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3274   PetscFunctionReturn(0);
3275 }
3276