xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision ef5145865fe2b3e7ccc5a94f2cf2a99b672211a9)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Adds one layer of overlap to the
   index sets is[] without ever forming global-length work arrays.

   Algorithm (as visible below):
     1. Collect the entries of all nidx sets; classify each row as local or
        remote using the matrix row layout.
     2. If no rank owns any remote row (Allreduce max of nrrows is 0), only
        local work is needed; do it and return.
     3. Otherwise build a two-sided communication pattern
        (PetscCommBuildTwoSided) and ship <is-id,row> pairs to the owning
        ranks via a PetscSF broadcast.
     4. Owners compute the overlap of the received rows
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and a second
        two-sided/SF round returns the result, which is merged into is[]
        by MatIncreaseOverlap_MPIAIJ_Receive_Scalable.
   The local overlap (step 2 work) is overlapped with the second
   communication round.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record <owner, is-id, row> for later packing */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: 2 ints (<is-id,row>) per remote row */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  /* prefix-sum the message sizes into per-destination offsets */
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* the SF takes ownership of 'remote' (PETSC_OWN_POINTER) */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second round: the former receivers now send overlap data back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received overlap data into is[] */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges overlap data received
   from other ranks into the index sets is[].

   recvdata is a flat stream of triples <is-id, count, count indices...>
   (nrecvs total integers).  Each destroyed is[i] is rebuilt as the sorted,
   de-duplicated union of its old entries and the received indices.

   NOTE(review): each IS's scratch slot is max_lsize+nrecvs integers; that
   bounds old entries (<= max_lsize) plus everything received (< nrecvs) —
   presumably intentional worst-case sizing, one slot per IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* record current sizes and the largest IS */
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* copy each IS into its scratch slot, then destroy it (rebuilt below) */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap;
     i advances through the stream inside the loop body */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and recreate the index sets */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For rows requested by other
   ranks (fromrows, a stream of <is-id,row> pairs grouped by source rank),
   computes the column indices of those locally-owned rows (from both the
   diagonal Amat and off-diagonal Bmat parts) and packs them into a reply
   buffer.

   Output (allocated here, ownership passes to the caller):
     sbrowsizes - per source rank: [2*i] = reply length, [2*i+1] = offset
     sbrows     - reply stream of triples <is-id, count, count indices...>

   The packing is done in two passes over the same data: pass 1 counts
   (indv_counts/totalrows) to size the buffer, pass 2 fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count how many indices each <rank, is> pair will contribute */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together: bucket this rank's requested rows by is-id */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat: local column + cstart gives the global column */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat: gcols maps compressed columns to global */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again: pass 2 repeats the bucketing and fills the buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* compute per-rank offsets into the reply buffer */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Local part of the scalable
   overlap increase: each is[i] is replaced by the sorted, de-duplicated set
   of global column indices appearing in its locally-owned rows (taken from
   both the diagonal Amat and off-diagonal Bmat parts).

   NOTE(review): entries of is[i] owned by other ranks are skipped here
   (the `continue` below) — presumably their contribution is delivered
   separately by the Send/Receive pair in the calling routine; confirm
   against MatIncreaseOverlap_MPIAIJ_Once_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  /* tnz = all local nonzeros; an IS visits each local row at most once, so
     the scratch buffer below cannot overflow */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* only expand locally-owned rows */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: shift local column by cstart */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: gcols maps compressed to global columns */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - Adds one layer of overlap to the index
   sets is[] (non-scalable variant; uses raw MPI_Isend/Irecv rather than
   PetscSF, and per-IS bit tables of global length M).

   Phases visible below:
     1. Count, per destination rank, how many rows of each IS it owns
        (w1..w4 work arrays) and size the outgoing messages.
     2. Exchange message lengths (PetscGatherNumberOfMessages /
        PetscGatherMessageLengths), post Irecvs, pack and Isend the
        requests (tag1).  Message layout: [#IS, (is-id,count)... , rows...].
     3. While messages are in flight, expand the locally-owned rows
        (MatIncreaseOverlap_MPIAIJ_Local).
     4. Process received requests (MatIncreaseOverlap_MPIAIJ_Receive) and
        send the answers back (tag2); merge the answers into the per-IS
        tables, then rebuild each is[i].

   With PETSC_USE_CTABLE the per-IS row lists are hash tables (table_data)
   instead of dense arrays (data), trading time for memory.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j]: total rows going to rank j; w3[j]: number of ISs with rows for j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single 'tmp' buffer into per-destination segments */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          /* table maps row+1 -> 1-based position in the IS */
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata/isz1 are allocated by _Receive */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: [#IS, (is-id,count)..., rows...] */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* first time this row is seen for this IS */
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from its table / dense array */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      /* k = row+1, j = 1-based position; undo both offsets */
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total number of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - number of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct = total number of index sets over all messages; total_sz = total number
     of row indices carried by the incoming messages. Each message is laid out
     as: [nsets, (is_no,len) per set, then the row indices]. */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply size from the average number of nonzeros per local row;
     all replies share a single buffer xdata[0], grown on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0; /* running total of entries written into the shared buffer */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1; /* read cursor: skip the header (1 + 2 per set) */
    ct2    =  ct1;        /* write cursor into xdata[i] (same header layout) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* xtable marks global indices already placed in the reply for this IS */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) { /* copy the requested rows, deduplicated */
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase all xdata[] pointers
               of the messages packed so far */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* for every row just stored, append the global column indices of that row */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal part: shift local column to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal part: garray maps compressed to global */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* replies are packed back-to-back in xdata[0] */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 /*
1038     Every processor gets the entire matrix
1039 */
/*
   flag  - MAT_GET_VALUES fills numerical values; otherwise only the structure
           is built (MAT_INITIAL_MATRIX) or reused (MAT_REUSE_MATRIX)
   Bin   - on MAT_INITIAL_MATRIX, *Bin is allocated here and owned by the caller
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[row] = nonzeros in global row = diagonal (ad) + off-diagonal (bd) counts;
       fill the local slice, then allgather so every rank has the full array */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* MPI_IN_PLACE: each rank's contribution is already at its displacement in lens[] */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* write directly into B's j array at the slice owned by this rank; columns
       are emitted in ascending global order by merging bd (via garray) around ad */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by rank i (sum of its row lengths) */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already exists, only values may be refreshed */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same column-merge order as the index pass above, so values line up with b->j */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were allocated in the MAT_INITIAL_MATRIX branch;
       allocate them here on the reuse path */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  /* PetscFree2(NULL,NULL) is a no-op when flag != MAT_GET_VALUES on the reuse path */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1222 
1223 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1224 {
1225   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1226   Mat            submat,A = c->A,B = c->B;
1227   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1228   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1229   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1230   const PetscInt *icol,*irow;
1231   PetscInt       nrow,ncol,start;
1232   PetscErrorCode ierr;
1233   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1234   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1235   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1236   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1237   PetscInt       *lens,rmax,ncols,*cols,Crow;
1238 #if defined(PETSC_USE_CTABLE)
1239   PetscTable     cmap,rmap;
1240   PetscInt       *cmap_loc,*rmap_loc;
1241 #else
1242   PetscInt       *cmap,*rmap;
1243 #endif
1244   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1245   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1246   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1247   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1248   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1249   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1250   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1251   MPI_Comm       comm;
1252   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1253   PetscMPIInt    *onodes1,*olengths1,idex,end;
1254   Mat_SubSppt    *smatis1;
1255   PetscBool      isrowsorted;
1256 
1257   PetscFunctionBegin;
1258   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1259 
1260   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1261   size = c->size;
1262   rank = c->rank;
1263 
1264   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1265   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1266 
1267   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1268   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1269   if (allcolumns) {
1270     icol = NULL;
1271     ncol = C->cmap->N;
1272   } else {
1273     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1274     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1275   }
1276 
1277   if (scall == MAT_INITIAL_MATRIX) {
1278     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1279 
1280     //printf("[%d] ncol %d, Cn %d, CN %d\n",rank,ncol,C->cmap->n,C->cmap->N);
1281 
1282     /* Get some new tags to keep the communication clean */
1283     tag1 = ((PetscObject)C)->tag;
1284     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1285     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1286 
1287     /* evaluate communication - mesg to who, length of mesg, and buffer space
1288      required. Based on this, buffers are allocated, and data copied into them */
1289     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1290     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1291 
1292     /* w1[proc] = num of rows owned by proc -- to be requested */
1293     proc = 0;
1294     nrqs = 0; /* num of outgoing messages */
1295     for (j=0; j<nrow; j++) {
1296       row  = irow[j]; /* sorted! */
1297       while (row >= C->rmap->range[proc+1]) proc++;
1298       w1[proc]++;
1299       row2proc[j] = proc; /* map row index to proc */
1300 
1301       if (proc != rank && !w2[proc]) {
1302         w2[proc] = 1; nrqs++;
1303       }
1304     }
1305     w1[rank] = 0;  /* rows owned by self will not be requested */
1306 
1307     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1308     for (proc=0,j=0; proc<size; proc++) {
1309       if (w1[proc]) { pa[j++] = proc;}
1310     }
1311 
1312     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1313     msz = 0;              /* total mesg length (for all procs) */
1314     for (i=0; i<nrqs; i++) {
1315       proc      = pa[i];
1316       w1[proc] += 3;
1317       msz      += w1[proc];
1318     }
1319     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1320 
1321     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1322     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1323     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1324 
1325     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1326        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1327     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1328 
1329     /* Now post the Irecvs corresponding to these messages */
1330     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1331 
1332     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1333     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1334 
1335     /* Allocate Memory for outgoing messages */
1336     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1337     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1338     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1339 
1340     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1341     iptr = tmp;
1342     for (i=0; i<nrqs; i++) {
1343       proc        = pa[i];
1344       sbuf1[proc] = iptr;
1345       iptr       += w1[proc];
1346     }
1347 
1348     /* Form the outgoing messages */
1349     /* Initialize the header space */
1350     for (i=0; i<nrqs; i++) {
1351       proc      = pa[i];
1352       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1353       ptr[proc] = sbuf1[proc] + 3;
1354     }
1355 
1356     /* Parse the isrow and copy data into outbuf */
1357     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1358     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1359       proc = row2proc[j];
1360       if (proc != rank) { /* copy to the outgoing buf*/
1361         *ptr[proc] = irow[j];
1362         ctr[proc]++; ptr[proc]++;
1363       }
1364     }
1365 
1366     /* Update the headers for the current IS */
1367     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1368       if ((ctr_j = ctr[j])) {
1369         sbuf1_j        = sbuf1[j];
1370         k              = ++sbuf1_j[0];
1371         sbuf1_j[2*k]   = ctr_j;
1372         sbuf1_j[2*k-1] = 0;
1373       }
1374     }
1375 
1376     /* Now post the sends */
1377     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1378     for (i=0; i<nrqs; ++i) {
1379       proc = pa[i];
1380       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1381     }
1382 
1383     /* Post Receives to capture the buffer size */
1384     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1385     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1386 
1387     rbuf2[0] = tmp + msz;
1388     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1389 
1390     for (i=0; i<nrqs; ++i) {
1391       proc = pa[i];
1392       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1393     }
1394 
1395     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1396 
1397     /* Send to other procs the buf size they should allocate */
1398     /* Receive messages*/
1399     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1400     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1401 
1402     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1403     for (i=0; i<nrqr; ++i) {
1404       req_size[i] = 0;
1405       rbuf1_i        = rbuf1[i];
1406       start          = 2*rbuf1_i[0] + 1;
1407       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1408       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1409       sbuf2_i        = sbuf2[i];
1410       for (j=start; j<end; j++) {
1411         k            = rbuf1_i[j] - rstart;
1412         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1413         sbuf2_i[j]   = ncols;
1414         req_size[i] += ncols;
1415       }
1416       req_source1[i] = r_status1[i].MPI_SOURCE;
1417 
1418       /* form the header */
1419       sbuf2_i[0] = req_size[i];
1420       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1421 
1422       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1423     }
1424 
1425     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1426     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1427 
1428     /* rbuf2 is received, Post recv column indices a->j */
1429     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1430 
1431     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1432     for (i=0; i<nrqs; ++i) {
1433       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1434       req_source2[i] = r_status2[i].MPI_SOURCE;
1435       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1436     }
1437 
1438     /* Wait on sends1 and sends2 */
1439     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1440     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1441     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1442     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1443 
1444     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1445     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1446 
1447     /* Now allocate sending buffers for a->j, and send them off */
1448     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1449     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1450     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1451     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1452 
1453     for (i=0; i<nrqr; i++) { /* for each requested message */
1454       rbuf1_i   = rbuf1[i];
1455       sbuf_aj_i = sbuf_aj[i];
1456       ct1       = 2*rbuf1_i[0] + 1;
1457       ct2       = 0;
1458       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1459 
1460       kmax = rbuf1[i][2];
1461       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1462         row    = rbuf1_i[ct1] - rstart;
1463         nzA    = ai[row+1] - ai[row];
1464         nzB    = bi[row+1] - bi[row];
1465         ncols  = nzA + nzB;
1466         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1467 
1468         /* load the column indices for this row into cols*/
1469         cols = sbuf_aj_i + ct2;
1470 
1471         lwrite = 0;
1472         for (l=0; l<nzB; l++) {
1473           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1474         }
1475         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1476         for (l=0; l<nzB; l++) {
1477           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1478         }
1479 
1480         ct2 += ncols;
1481       }
1482       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1483     }
1484 
1485     /* create column map (cmap): global col of C -> local col of submat */
1486 #if defined(PETSC_USE_CTABLE)
1487     if (!allcolumns) {
1488       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1489       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1490       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1491         if (icol[j] >= cstart && icol[j] <cend) {
1492           cmap_loc[icol[j] - cstart] = j+1;
1493         } else { /* use PetscTable for non-local col indices */
1494           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1495         }
1496       }
1497     } else {
1498       cmap     = NULL;
1499       cmap_loc = NULL;
1500     }
1501     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1502 #else
1503     if (!allcolumns) {
1504       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1505       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1506     } else {
1507       cmap = NULL;
1508     }
1509 #endif
1510 
1511     /* Create lens for MatSeqAIJSetPreallocation() */
1512     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1513 
1514     /* Compute lens from local part of C */
1515     for (j=0; j<nrow; j++) {
1516       row  = irow[j];
1517       proc = row2proc[j];
1518       if (proc == rank) {
1519         /* diagonal part A = c->A */
1520         ncols = ai[row-rstart+1] - ai[row-rstart];
1521         cols  = aj + ai[row-rstart];
1522         if (!allcolumns) {
1523           for (k=0; k<ncols; k++) {
1524 #if defined(PETSC_USE_CTABLE)
1525             tcol = cmap_loc[cols[k]];
1526 #else
1527             tcol = cmap[cols[k]+cstart];
1528 #endif
1529             if (tcol) lens[j]++;
1530           }
1531         } else { /* allcolumns */
1532           lens[j] = ncols;
1533         }
1534 
1535         /* off-diagonal part B = c->B */
1536         ncols = bi[row-rstart+1] - bi[row-rstart];
1537         cols  = bj + bi[row-rstart];
1538         if (!allcolumns) {
1539           for (k=0; k<ncols; k++) {
1540 #if defined(PETSC_USE_CTABLE)
1541             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1542 #else
1543             tcol = cmap[bmap[cols[k]]];
1544 #endif
1545             if (tcol) lens[j]++;
1546           }
1547         } else { /* allcolumns */
1548           lens[j] += ncols;
1549         }
1550       }
1551     }
1552 
1553     /* Create row map (rmap): global row of C -> local row of submat */
1554 #if defined(PETSC_USE_CTABLE)
1555     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1556     for (j=0; j<nrow; j++) {
1557       row  = irow[j];
1558       proc = row2proc[j];
1559       if (proc == rank) { /* a local row */
1560         rmap_loc[row - rstart] = j;
1561       } else {
1562         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1563       }
1564     }
1565 #else
1566     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1567     for (j=0; j<nrow; j++) {
1568       rmap[irow[j]] = j;
1569     }
1570 #endif
1571 
1572     /* Update lens from offproc data */
1573     /* recv a->j is done */
1574     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1575     for (i=0; i<nrqs; i++) {
1576       proc    = pa[i];
1577       sbuf1_i = sbuf1[proc];
1578       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1579       ct1     = 2 + 1;
1580       ct2     = 0;
1581       rbuf2_i = rbuf2[i]; /* received length of C->j */
1582       rbuf3_i = rbuf3[i]; /* received C->j */
1583 
1584       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1585       max1   = sbuf1_i[2];
1586       for (k=0; k<max1; k++,ct1++) {
1587 #if defined(PETSC_USE_CTABLE)
1588         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1589         row--;
1590         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1591 #else
1592         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1593 #endif
1594         /* Now, store row index of submat in sbuf1_i[ct1] */
1595         sbuf1_i[ct1] = row;
1596 
1597         nnz = rbuf2_i[ct1];
1598         if (!allcolumns) {
1599           for (l=0; l<nnz; l++,ct2++) {
1600 #if defined(PETSC_USE_CTABLE)
1601             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1602               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1603             } else {
1604               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1605             }
1606 #else
1607             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1608 #endif
1609             if (tcol) lens[row]++;
1610           }
1611         } else { /* allcolumns */
1612           lens[row] += nnz;
1613         }
1614       }
1615     }
1616     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1617     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1618 
1619     /* Create the submatrices */
1620     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1621     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1622 
1623     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1624     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1625     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1626     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1627     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1628 
1629     /* create struct Mat_SubSppt and attached it to submat */
1630     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1631     subc = (Mat_SeqAIJ*)submat->data;
1632     subc->submatis1 = smatis1;
1633 
1634     smatis1->id          = 0;
1635     smatis1->nrqs        = nrqs;
1636     smatis1->nrqr        = nrqr;
1637     smatis1->rbuf1       = rbuf1;
1638     smatis1->rbuf2       = rbuf2;
1639     smatis1->rbuf3       = rbuf3;
1640     smatis1->sbuf2       = sbuf2;
1641     smatis1->req_source2 = req_source2;
1642 
1643     smatis1->sbuf1       = sbuf1;
1644     smatis1->ptr         = ptr;
1645     smatis1->tmp         = tmp;
1646     smatis1->ctr         = ctr;
1647 
1648     smatis1->pa           = pa;
1649     smatis1->req_size     = req_size;
1650     smatis1->req_source1  = req_source1;
1651 
1652     smatis1->allcolumns  = allcolumns;
1653     smatis1->singleis    = PETSC_TRUE;
1654     smatis1->row2proc    = row2proc;
1655     smatis1->rmap        = rmap;
1656     smatis1->cmap        = cmap;
1657 #if defined(PETSC_USE_CTABLE)
1658     smatis1->rmap_loc    = rmap_loc;
1659     smatis1->cmap_loc    = cmap_loc;
1660 #endif
1661 
1662     smatis1->destroy     = submat->ops->destroy;
1663     submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
1664     submat->factortype   = C->factortype;
1665 
1666     /* compute rmax */
1667     rmax = 0;
1668     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1669 
1670   } else { /* scall == MAT_REUSE_MATRIX */
1671     submat = submats[0];
1672     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1673 
1674     subc    = (Mat_SeqAIJ*)submat->data;
1675     rmax    = subc->rmax;
1676     smatis1 = subc->submatis1;
1677     nrqs        = smatis1->nrqs;
1678     nrqr        = smatis1->nrqr;
1679     rbuf1       = smatis1->rbuf1;
1680     rbuf2       = smatis1->rbuf2;
1681     rbuf3       = smatis1->rbuf3;
1682     req_source2 = smatis1->req_source2;
1683 
1684     sbuf1     = smatis1->sbuf1;
1685     sbuf2     = smatis1->sbuf2;
1686     ptr       = smatis1->ptr;
1687     tmp       = smatis1->tmp;
1688     ctr       = smatis1->ctr;
1689 
1690     pa         = smatis1->pa;
1691     req_size   = smatis1->req_size;
1692     req_source1 = smatis1->req_source1;
1693 
1694     allcolumns = smatis1->allcolumns;
1695     row2proc   = smatis1->row2proc;
1696     rmap       = smatis1->rmap;
1697     cmap       = smatis1->cmap;
1698 #if defined(PETSC_USE_CTABLE)
1699     rmap_loc   = smatis1->rmap_loc;
1700     cmap_loc   = smatis1->cmap_loc;
1701 #endif
1702   }
1703 
1704   /* Post recv matrix values */
1705   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1706   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1707   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1708   for (i=0; i<nrqs; ++i) {
1709     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1710     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1711   }
1712 
1713   /* Allocate sending buffers for a->a, and send them off */
1714   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1715   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1716   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1717   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1718 
1719   for (i=0; i<nrqr; i++) {
1720     rbuf1_i   = rbuf1[i];
1721     sbuf_aa_i = sbuf_aa[i];
1722     ct1       = 2*rbuf1_i[0]+1;
1723     ct2       = 0;
1724     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1725 
1726     kmax = rbuf1_i[2];
1727     for (k=0; k<kmax; k++,ct1++) {
1728       row = rbuf1_i[ct1] - rstart;
1729       nzA = ai[row+1] - ai[row];
1730       nzB = bi[row+1] - bi[row];
1731       ncols  = nzA + nzB;
1732       cworkB = bj + bi[row];
1733       vworkA = a_a + ai[row];
1734       vworkB = b_a + bi[row];
1735 
1736       /* load the column values for this row into vals*/
1737       vals = sbuf_aa_i + ct2;
1738 
1739       lwrite = 0;
1740       for (l=0; l<nzB; l++) {
1741         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1742       }
1743       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1744       for (l=0; l<nzB; l++) {
1745         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1746       }
1747 
1748       ct2 += ncols;
1749     }
1750     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1751   }
1752 
1753   /* Assemble submat */
1754   /* First assemble the local rows */
1755   for (j=0; j<nrow; j++) {
1756     row  = irow[j];
1757     proc = row2proc[j];
1758     if (proc == rank) {
1759       Crow = row - rstart;  /* local row index of C */
1760 #if defined(PETSC_USE_CTABLE)
1761       row = rmap_loc[Crow]; /* row index of submat */
1762 #else
1763       row = rmap[row];
1764 #endif
1765 
1766       if (allcolumns) {
1767         /* diagonal part A = c->A */
1768         ncols = ai[Crow+1] - ai[Crow];
1769         cols  = aj + ai[Crow];
1770         vals  = a->a + ai[Crow];
1771         i     = 0;
1772         for (k=0; k<ncols; k++) {
1773           subcols[i]   = cols[k] + cstart;
1774           subvals[i++] = vals[k];
1775         }
1776 
1777         /* off-diagonal part B = c->B */
1778         ncols = bi[Crow+1] - bi[Crow];
1779         cols  = bj + bi[Crow];
1780         vals  = b->a + bi[Crow];
1781         for (k=0; k<ncols; k++) {
1782           subcols[i]   = bmap[cols[k]];
1783           subvals[i++] = vals[k];
1784         }
1785 
1786         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1787 
1788       } else { /* !allcolumns */
1789 #if defined(PETSC_USE_CTABLE)
1790         /* diagonal part A = c->A */
1791         ncols = ai[Crow+1] - ai[Crow];
1792         cols  = aj + ai[Crow];
1793         vals  = a->a + ai[Crow];
1794         i     = 0;
1795         for (k=0; k<ncols; k++) {
1796           tcol = cmap_loc[cols[k]];
1797           if (tcol) {
1798             subcols[i]   = --tcol;
1799             subvals[i++] = vals[k];
1800           }
1801         }
1802 
1803         /* off-diagonal part B = c->B */
1804         ncols = bi[Crow+1] - bi[Crow];
1805         cols  = bj + bi[Crow];
1806         vals  = b->a + bi[Crow];
1807         for (k=0; k<ncols; k++) {
1808           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1809           if (tcol) {
1810             subcols[i]   = --tcol;
1811             subvals[i++] = vals[k];
1812           }
1813         }
1814 #else
1815         /* diagonal part A = c->A */
1816         ncols = ai[Crow+1] - ai[Crow];
1817         cols  = aj + ai[Crow];
1818         vals  = a->a + ai[Crow];
1819         i     = 0;
1820         for (k=0; k<ncols; k++) {
1821           tcol = cmap[cols[k]+cstart];
1822           if (tcol) {
1823             subcols[i]   = --tcol;
1824             subvals[i++] = vals[k];
1825           }
1826         }
1827 
1828         /* off-diagonal part B = c->B */
1829         ncols = bi[Crow+1] - bi[Crow];
1830         cols  = bj + bi[Crow];
1831         vals  = b->a + bi[Crow];
1832         for (k=0; k<ncols; k++) {
1833           tcol = cmap[bmap[cols[k]]];
1834           if (tcol) {
1835             subcols[i]   = --tcol;
1836             subvals[i++] = vals[k];
1837           }
1838         }
1839 #endif
1840         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1841       }
1842     }
1843   }
1844 
1845   /* Now assemble the off-proc rows */
1846   for (i=0; i<nrqs; i++) { /* for each requested message */
1847     /* recv values from other processes */
1848     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1849     proc    = pa[idex];
1850     sbuf1_i = sbuf1[proc];
1851     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1852     ct1     = 2 + 1;
1853     ct2     = 0; /* count of received C->j */
1854     ct3     = 0; /* count of received C->j that will be inserted into submat */
1855     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1856     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1857     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1858 
1859     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1860     max1 = sbuf1_i[2];             /* num of rows */
1861     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1862       row = sbuf1_i[ct1]; /* row index of submat */
1863       if (!allcolumns) {
1864         idex = 0;
1865         if (scall == MAT_INITIAL_MATRIX) {
1866           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1867           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1868 #if defined(PETSC_USE_CTABLE)
1869             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1870               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1871             } else {
1872               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1873             }
1874 #else
1875             tcol = cmap[rbuf3_i[ct2]];
1876 #endif
1877             if (tcol) {
1878               subcols[idex]   = --tcol;
1879               subvals[idex++] = rbuf4_i[ct2];
1880 
1881               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1882                For reuse, we replace received C->j with index that should be inserted to submat */
1883               rbuf3_i[ct3++] = ct2;
1884             }
1885           }
1886           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1887 
1888         } else { /* scall == MAT_REUSE_MATRIX */
1889           submat = submats[0];
1890           subc   = (Mat_SeqAIJ*)submat->data;
1891 
1892           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1893           for (l=0; l<nnz; l++) {
1894             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1895             subvals[idex++] = rbuf4_i[ct2];
1896           }
1897 
1898           bj = subc->j + subc->i[row];
1899           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1900         }
1901       } else { /* allcolumns */
1902         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1903         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1904         ct2 += nnz;
1905       }
1906     }
1907   }
1908 
1909   /* sending a->a are done */
1910   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1911   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1912 
1913   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1914   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1915   submats[0] = submat;
1916 
1917   /* Restore the indices */
1918   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1919   if (!allcolumns) {
1920     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1921   }
1922 
1923   /* Destroy allocated memory */
1924   for (i=0; i<nrqs; ++i) {
1925     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1926   }
1927   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1928   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1929   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1930 
1931   if (scall == MAT_INITIAL_MATRIX) {
1932     ierr = PetscFree(lens);CHKERRQ(ierr);
1933     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1934     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1935   }
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1940 {
1941   PetscErrorCode ierr;
1942   PetscInt       ncol;
1943   PetscBool      colflag,allcolumns=PETSC_FALSE;
1944 
1945   PetscFunctionBegin;
1946   /* Allocate memory to hold all the submatrices */
1947   if (scall == MAT_INITIAL_MATRIX) {
1948     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1949   }
1950 
1951   /* Check for special case: each processor gets entire matrix columns */
1952   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1953   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1954   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1955 
1956   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1957   PetscFunctionReturn(0);
1958 }
1959 
/*
   MatCreateSubMatrices_MPIAIJ - top-level driver that extracts ismax sequential
   submatrices of the parallel matrix C, one per (isrow[i],iscol[i]) pair.

   Dispatch order:
     1. single-IS fast path (flag set by PCSetUp_ASM) -> _SingleIS variant;
     2. "want the whole matrix" fast path -> MatCreateSubMatrix_MPIAIJ_All;
     3. general case: process the index sets in nstages batches of at most
        nmax submatrices each, via MatCreateSubMatrices_MPIAIJ_Local.

   Collective on the communicator of C (contains an MPIU_Allreduce and the
   staged loop calls collective routines), so every rank must execute the
   same number of stages.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices per stage, sized so each stage's work arrays
     stay around 20M PetscInts (scaled down by the global column count) */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single identity row IS and identity column IS covering all of a square
         C means this rank wants the entire matrix */
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One Allreduce decides both quantities:
       - in[0]: -1 if this rank wants the whole matrix, else 0; MPI_MAX yields 0
         if ANY rank does not, i.e. the fast path needs unanimity;
       - in[1]: MPI_MAX of the local stage counts, so all ranks loop the same
         number of (collective) stages */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      /* reuse the stage count saved during the MAT_INITIAL_MATRIX call */
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  /* Staged extraction: a rank that has exhausted its local index sets still
     participates with max_no = 0 so the collective calls stay matched */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2057 
2058 /* -------------------------------------------------------------------------*/
2059 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2060 {
2061   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2062   Mat            A  = c->A;
2063   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2064   const PetscInt **icol,**irow;
2065   PetscInt       *nrow,*ncol,start;
2066   PetscErrorCode ierr;
2067   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2068   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2069   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2070   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2071   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2072 #if defined(PETSC_USE_CTABLE)
2073   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2074 #else
2075   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2076 #endif
2077   const PetscInt *irow_i;
2078   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2079   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2080   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2081   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2082   MPI_Status     *r_status3,*r_status4,*s_status4;
2083   MPI_Comm       comm;
2084   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2085   PetscMPIInt    *onodes1,*olengths1,end;
2086   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2087   Mat_SubSppt    *smat_i;
2088   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2089   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2090 
2091   PetscFunctionBegin;
2092   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2093   size = c->size;
2094   rank = c->rank;
2095 
2096   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2097   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2098 
2099   for (i=0; i<ismax; i++) {
2100     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2101     if (!issorted[i]) iscsorted = issorted[i];
2102 
2103     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2104 
2105     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2106     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2107 
2108     /* Check for special case: allcolumn */
2109     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2110     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2111     if (colflag && ncol[i] == C->cmap->N) {
2112       allcolumns[i] = PETSC_TRUE;
2113       icol[i] = NULL;
2114     } else {
2115       allcolumns[i] = PETSC_FALSE;
2116       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2117     }
2118   }
2119 
2120   if (scall == MAT_REUSE_MATRIX) {
2121     /* Assumes new rows are same length as the old rows */
2122     for (i=0; i<ismax; i++) {
2123       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2124       subc = (Mat_SeqAIJ*)submats[i]->data;
2125       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2126 
2127       /* Initial matrix as if empty */
2128       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2129 
2130       smat_i   = subc->submatis1;
2131 
2132       nrqs        = smat_i->nrqs;
2133       nrqr        = smat_i->nrqr;
2134       rbuf1       = smat_i->rbuf1;
2135       rbuf2       = smat_i->rbuf2;
2136       rbuf3       = smat_i->rbuf3;
2137       req_source2 = smat_i->req_source2;
2138 
2139       sbuf1     = smat_i->sbuf1;
2140       sbuf2     = smat_i->sbuf2;
2141       ptr       = smat_i->ptr;
2142       tmp       = smat_i->tmp;
2143       ctr       = smat_i->ctr;
2144 
2145       pa          = smat_i->pa;
2146       req_size    = smat_i->req_size;
2147       req_source1 = smat_i->req_source1;
2148 
2149       allcolumns[i] = smat_i->allcolumns;
2150       row2proc[i]   = smat_i->row2proc;
2151       rmap[i]       = smat_i->rmap;
2152       cmap[i]       = smat_i->cmap;
2153     }
2154 
2155     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2156       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2157       smat_i = (Mat_SubSppt*)submats[0]->data;
2158 
2159       nrqs        = smat_i->nrqs;
2160       nrqr        = smat_i->nrqr;
2161       rbuf1       = smat_i->rbuf1;
2162       rbuf2       = smat_i->rbuf2;
2163       rbuf3       = smat_i->rbuf3;
2164       req_source2 = smat_i->req_source2;
2165 
2166       sbuf1       = smat_i->sbuf1;
2167       sbuf2       = smat_i->sbuf2;
2168       ptr         = smat_i->ptr;
2169       tmp         = smat_i->tmp;
2170       ctr         = smat_i->ctr;
2171 
2172       pa          = smat_i->pa;
2173       req_size    = smat_i->req_size;
2174       req_source1 = smat_i->req_source1;
2175 
2176       allcolumns[0] = PETSC_FALSE;
2177     }
2178   } else { /* scall == MAT_INITIAL_MATRIX */
2179     /* Get some new tags to keep the communication clean */
2180     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2181     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2182 
2183     /* evaluate communication - mesg to who, length of mesg, and buffer space
2184      required. Based on this, buffers are allocated, and data copied into them*/
2185     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2186 
2187     for (i=0; i<ismax; i++) {
2188       jmax   = nrow[i];
2189       irow_i = irow[i];
2190 
2191       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2192       row2proc[i] = row2proc_i;
2193 
2194       if (issorted[i]) proc = 0;
2195       for (j=0; j<jmax; j++) {
2196         if (!issorted[i]) proc = 0;
2197         row = irow_i[j];
2198         while (row >= C->rmap->range[proc+1]) proc++;
2199         w4[proc]++;
2200         row2proc_i[j] = proc; /* map row index to proc */
2201       }
2202       for (j=0; j<size; j++) {
2203         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2204       }
2205     }
2206 
2207     nrqs     = 0;              /* no of outgoing messages */
2208     msz      = 0;              /* total mesg length (for all procs) */
2209     w1[rank] = 0;              /* no mesg sent to self */
2210     w3[rank] = 0;
2211     for (i=0; i<size; i++) {
2212       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2213     }
2214     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2215     for (i=0,j=0; i<size; i++) {
2216       if (w1[i]) { pa[j] = i; j++; }
2217     }
2218 
2219     /* Each message would have a header = 1 + 2*(no of IS) + data */
2220     for (i=0; i<nrqs; i++) {
2221       j      = pa[i];
2222       w1[j] += w2[j] + 2* w3[j];
2223       msz   += w1[j];
2224     }
2225     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2226 
2227     /* Determine the number of messages to expect, their lengths, from from-ids */
2228     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2229     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2230 
2231     /* Now post the Irecvs corresponding to these messages */
2232     tag0 = ((PetscObject)C)->tag;
2233     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2234 
2235     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2236     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2237 
2238     /* Allocate Memory for outgoing messages */
2239     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2240     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2241     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2242 
2243     {
2244       PetscInt *iptr = tmp;
2245       k    = 0;
2246       for (i=0; i<nrqs; i++) {
2247         j        = pa[i];
2248         iptr    += k;
2249         sbuf1[j] = iptr;
2250         k        = w1[j];
2251       }
2252     }
2253 
2254     /* Form the outgoing messages. Initialize the header space */
2255     for (i=0; i<nrqs; i++) {
2256       j           = pa[i];
2257       sbuf1[j][0] = 0;
2258       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2259       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2260     }
2261 
2262     /* Parse the isrow and copy data into outbuf */
2263     for (i=0; i<ismax; i++) {
2264       row2proc_i = row2proc[i];
2265       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2266       irow_i = irow[i];
2267       jmax   = nrow[i];
2268       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2269         proc = row2proc_i[j];
2270         if (proc != rank) { /* copy to the outgoing buf*/
2271           ctr[proc]++;
2272           *ptr[proc] = irow_i[j];
2273           ptr[proc]++;
2274         }
2275       }
2276       /* Update the headers for the current IS */
2277       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2278         if ((ctr_j = ctr[j])) {
2279           sbuf1_j        = sbuf1[j];
2280           k              = ++sbuf1_j[0];
2281           sbuf1_j[2*k]   = ctr_j;
2282           sbuf1_j[2*k-1] = i;
2283         }
2284       }
2285     }
2286 
2287     /*  Now  post the sends */
2288     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2289     for (i=0; i<nrqs; ++i) {
2290       j    = pa[i];
2291       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2292     }
2293 
2294     /* Post Receives to capture the buffer size */
2295     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2296     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2297     rbuf2[0] = tmp + msz;
2298     for (i=1; i<nrqs; ++i) {
2299       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2300     }
2301     for (i=0; i<nrqs; ++i) {
2302       j    = pa[i];
2303       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2304     }
2305 
2306     /* Send to other procs the buf size they should allocate */
2307     /* Receive messages*/
2308     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2309     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2310     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2311     {
2312       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2313       PetscInt   *sbuf2_i;
2314 
2315       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2316       for (i=0; i<nrqr; ++i) {
2317         req_size[i] = 0;
2318         rbuf1_i        = rbuf1[i];
2319         start          = 2*rbuf1_i[0] + 1;
2320         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2321         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2322         sbuf2_i        = sbuf2[i];
2323         for (j=start; j<end; j++) {
2324           id              = rbuf1_i[j] - rstart;
2325           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2326           sbuf2_i[j]      = ncols;
2327           req_size[i] += ncols;
2328         }
2329         req_source1[i] = r_status1[i].MPI_SOURCE;
2330         /* form the header */
2331         sbuf2_i[0] = req_size[i];
2332         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2333 
2334         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2335       }
2336     }
2337     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2338     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2339     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2340 
2341     /* Receive messages*/
2342     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2343     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2344 
2345     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2346     for (i=0; i<nrqs; ++i) {
2347       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2348       req_source2[i] = r_status2[i].MPI_SOURCE;
2349       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2350     }
2351     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2352     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2353 
2354     /* Wait on sends1 and sends2 */
2355     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2356     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2357 
2358     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2359     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2360     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2361     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2362     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2363     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2364 
2365     /* Now allocate sending buffers for a->j, and send them off */
2366     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2367     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2368     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2369     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2370 
2371     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2372     {
2373       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2374       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2375       PetscInt cend = C->cmap->rend;
2376       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2377 
2378       for (i=0; i<nrqr; i++) {
2379         rbuf1_i   = rbuf1[i];
2380         sbuf_aj_i = sbuf_aj[i];
2381         ct1       = 2*rbuf1_i[0] + 1;
2382         ct2       = 0;
2383         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2384           kmax = rbuf1[i][2*j];
2385           for (k=0; k<kmax; k++,ct1++) {
2386             row    = rbuf1_i[ct1] - rstart;
2387             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2388             ncols  = nzA + nzB;
2389             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2390 
2391             /* load the column indices for this row into cols */
2392             cols = sbuf_aj_i + ct2;
2393 
2394             lwrite = 0;
2395             for (l=0; l<nzB; l++) {
2396               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2397             }
2398             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2399             for (l=0; l<nzB; l++) {
2400               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2401             }
2402 
2403             ct2 += ncols;
2404           }
2405         }
2406         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2407       }
2408     }
2409     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2410 
2411     /* create col map: global col of C -> local col of submatrices */
2412     {
2413       const PetscInt *icol_i;
2414 #if defined(PETSC_USE_CTABLE)
2415       for (i=0; i<ismax; i++) {
2416         if (!allcolumns[i]) {
2417           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2418 
2419           jmax   = ncol[i];
2420           icol_i = icol[i];
2421           cmap_i = cmap[i];
2422           for (j=0; j<jmax; j++) {
2423             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2424           }
2425         } else cmap[i] = NULL;
2426       }
2427 #else
2428       for (i=0; i<ismax; i++) {
2429         if (!allcolumns[i]) {
2430           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2431           jmax   = ncol[i];
2432           icol_i = icol[i];
2433           cmap_i = cmap[i];
2434           for (j=0; j<jmax; j++) {
2435             cmap_i[icol_i[j]] = j+1;
2436           }
2437         } else cmap[i] = NULL;
2438       }
2439 #endif
2440     }
2441 
2442     /* Create lens which is required for MatCreate... */
2443     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2444     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2445 
2446     if (ismax) {
2447       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2448     }
2449     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2450 
2451     /* Update lens from local data */
2452     for (i=0; i<ismax; i++) {
2453       row2proc_i = row2proc[i];
2454       jmax = nrow[i];
2455       if (!allcolumns[i]) cmap_i = cmap[i];
2456       irow_i = irow[i];
2457       lens_i = lens[i];
2458       for (j=0; j<jmax; j++) {
2459         row = irow_i[j];
2460         proc = row2proc_i[j];
2461         if (proc == rank) {
2462           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2463           if (!allcolumns[i]) {
2464             for (k=0; k<ncols; k++) {
2465 #if defined(PETSC_USE_CTABLE)
2466               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2467 #else
2468               tcol = cmap_i[cols[k]];
2469 #endif
2470               if (tcol) lens_i[j]++;
2471             }
2472           } else { /* allcolumns */
2473             lens_i[j] = ncols;
2474           }
2475           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2476         }
2477       }
2478     }
2479 
2480     /* Create row map: global row of C -> local row of submatrices */
2481 #if defined(PETSC_USE_CTABLE)
2482     for (i=0; i<ismax; i++) {
2483       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2484       irow_i = irow[i];
2485       jmax   = nrow[i];
2486       for (j=0; j<jmax; j++) {
2487       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2488       }
2489     }
2490 #else
2491     for (i=0; i<ismax; i++) {
2492       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2493       rmap_i = rmap[i];
2494       irow_i = irow[i];
2495       jmax   = nrow[i];
2496       for (j=0; j<jmax; j++) {
2497         rmap_i[irow_i[j]] = j;
2498       }
2499     }
2500 #endif
2501 
2502     /* Update lens from offproc data */
2503     {
2504       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2505 
2506       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2507       for (tmp2=0; tmp2<nrqs; tmp2++) {
2508         sbuf1_i = sbuf1[pa[tmp2]];
2509         jmax    = sbuf1_i[0];
2510         ct1     = 2*jmax+1;
2511         ct2     = 0;
2512         rbuf2_i = rbuf2[tmp2];
2513         rbuf3_i = rbuf3[tmp2];
2514         for (j=1; j<=jmax; j++) {
2515           is_no  = sbuf1_i[2*j-1];
2516           max1   = sbuf1_i[2*j];
2517           lens_i = lens[is_no];
2518           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2519           rmap_i = rmap[is_no];
2520           for (k=0; k<max1; k++,ct1++) {
2521 #if defined(PETSC_USE_CTABLE)
2522             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2523             row--;
2524             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2525 #else
2526             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2527 #endif
2528             max2 = rbuf2_i[ct1];
2529             for (l=0; l<max2; l++,ct2++) {
2530               if (!allcolumns[is_no]) {
2531 #if defined(PETSC_USE_CTABLE)
2532                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2533 #else
2534                 tcol = cmap_i[rbuf3_i[ct2]];
2535 #endif
2536                 if (tcol) lens_i[row]++;
2537               } else { /* allcolumns */
2538                 lens_i[row]++; /* lens_i[row] += max2 ? */
2539               }
2540             }
2541           }
2542         }
2543       }
2544     }
2545     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2546     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2547     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2548     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2549 
2550     /* Create the submatrices */
2551     for (i=0; i<ismax; i++) {
2552       PetscInt    rbs,cbs;
2553 
2554       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2555       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2556 
2557       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2558       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2559 
2560       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2561       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2562       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2563 
2564       /* create struct Mat_SubSppt and attached it to submat */
2565       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2566       subc = (Mat_SeqAIJ*)submats[i]->data;
2567       subc->submatis1 = smat_i;
2568 
2569       smat_i->destroy          = submats[i]->ops->destroy;
2570       submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
2571       submats[i]->factortype   = C->factortype;
2572 
2573       smat_i->id          = i;
2574       smat_i->nrqs        = nrqs;
2575       smat_i->nrqr        = nrqr;
2576       smat_i->rbuf1       = rbuf1;
2577       smat_i->rbuf2       = rbuf2;
2578       smat_i->rbuf3       = rbuf3;
2579       smat_i->sbuf2       = sbuf2;
2580       smat_i->req_source2 = req_source2;
2581 
2582       smat_i->sbuf1       = sbuf1;
2583       smat_i->ptr         = ptr;
2584       smat_i->tmp         = tmp;
2585       smat_i->ctr         = ctr;
2586 
2587       smat_i->pa           = pa;
2588       smat_i->req_size     = req_size;
2589       smat_i->req_source1  = req_source1;
2590 
2591       smat_i->allcolumns  = allcolumns[i];
2592       smat_i->singleis    = PETSC_FALSE;
2593       smat_i->row2proc    = row2proc[i];
2594       smat_i->rmap        = rmap[i];
2595       smat_i->cmap        = cmap[i];
2596     }
2597 
2598     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2599       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2600       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2601       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2602 
2603       /* create struct Mat_SubSppt and attached it to submat */
2604       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2605       submats[0]->data = (void*)smat_i;
2606 
2607       smat_i->destroy          = submats[0]->ops->destroy;
2608       submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
2609       submats[0]->factortype   = C->factortype;
2610 
2611       smat_i->id          = 0;
2612       smat_i->nrqs        = nrqs;
2613       smat_i->nrqr        = nrqr;
2614       smat_i->rbuf1       = rbuf1;
2615       smat_i->rbuf2       = rbuf2;
2616       smat_i->rbuf3       = rbuf3;
2617       smat_i->sbuf2       = sbuf2;
2618       smat_i->req_source2 = req_source2;
2619 
2620       smat_i->sbuf1       = sbuf1;
2621       smat_i->ptr         = ptr;
2622       smat_i->tmp         = tmp;
2623       smat_i->ctr         = ctr;
2624 
2625       smat_i->pa           = pa;
2626       smat_i->req_size     = req_size;
2627       smat_i->req_source1  = req_source1;
2628 
2629       smat_i->allcolumns  = PETSC_FALSE;
2630       smat_i->singleis    = PETSC_FALSE;
2631       smat_i->row2proc    = NULL;
2632       smat_i->rmap        = NULL;
2633       smat_i->cmap        = NULL;
2634     }
2635 
2636     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2637     ierr = PetscFree(lens);CHKERRQ(ierr);
2638     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2639     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2640 
2641   } /* endof scall == MAT_INITIAL_MATRIX */
2642 
2643   /* Post recv matrix values */
2644   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2645   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2646   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2647   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2648   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2649   for (i=0; i<nrqs; ++i) {
2650     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2651     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2652   }
2653 
2654   /* Allocate sending buffers for a->a, and send them off */
2655   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2656   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2657   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2658   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2659 
2660   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2661   {
2662     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2663     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2664     PetscInt    cend   = C->cmap->rend;
2665     PetscInt    *b_j   = b->j;
2666     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2667 
2668     for (i=0; i<nrqr; i++) {
2669       rbuf1_i   = rbuf1[i];
2670       sbuf_aa_i = sbuf_aa[i];
2671       ct1       = 2*rbuf1_i[0]+1;
2672       ct2       = 0;
2673       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2674         kmax = rbuf1_i[2*j];
2675         for (k=0; k<kmax; k++,ct1++) {
2676           row    = rbuf1_i[ct1] - rstart;
2677           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2678           ncols  = nzA + nzB;
2679           cworkB = b_j + b_i[row];
2680           vworkA = a_a + a_i[row];
2681           vworkB = b_a + b_i[row];
2682 
2683           /* load the column values for this row into vals*/
2684           vals = sbuf_aa_i+ct2;
2685 
2686           lwrite = 0;
2687           for (l=0; l<nzB; l++) {
2688             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2689           }
2690           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2691           for (l=0; l<nzB; l++) {
2692             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2693           }
2694 
2695           ct2 += ncols;
2696         }
2697       }
2698       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2699     }
2700   }
2701 
2702   /* Assemble the matrices */
2703   /* First assemble the local rows */
2704   for (i=0; i<ismax; i++) {
2705     row2proc_i = row2proc[i];
2706     subc      = (Mat_SeqAIJ*)submats[i]->data;
2707     imat_ilen = subc->ilen;
2708     imat_j    = subc->j;
2709     imat_i    = subc->i;
2710     imat_a    = subc->a;
2711 
2712     if (!allcolumns[i]) cmap_i = cmap[i];
2713     rmap_i = rmap[i];
2714     irow_i = irow[i];
2715     jmax   = nrow[i];
2716     for (j=0; j<jmax; j++) {
2717       row  = irow_i[j];
2718       proc = row2proc_i[j];
2719       if (proc == rank) {
2720         old_row = row;
2721 #if defined(PETSC_USE_CTABLE)
2722         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2723         row--;
2724 #else
2725         row = rmap_i[row];
2726 #endif
2727         ilen_row = imat_ilen[row];
2728         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2729         mat_i    = imat_i[row];
2730         mat_a    = imat_a + mat_i;
2731         mat_j    = imat_j + mat_i;
2732         if (!allcolumns[i]) {
2733           for (k=0; k<ncols; k++) {
2734 #if defined(PETSC_USE_CTABLE)
2735             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2736 #else
2737             tcol = cmap_i[cols[k]];
2738 #endif
2739             if (tcol) {
2740               *mat_j++ = tcol - 1;
2741               *mat_a++ = vals[k];
2742               ilen_row++;
2743             }
2744           }
2745         } else { /* allcolumns */
2746           for (k=0; k<ncols; k++) {
2747             *mat_j++ = cols[k];  /* global col index! */
2748             *mat_a++ = vals[k];
2749             ilen_row++;
2750           }
2751         }
2752         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2753 
2754         imat_ilen[row] = ilen_row;
2755       }
2756     }
2757   }
2758 
2759   /* Now assemble the off proc rows */
2760   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2761   for (tmp2=0; tmp2<nrqs; tmp2++) {
2762     sbuf1_i = sbuf1[pa[tmp2]];
2763     jmax    = sbuf1_i[0];
2764     ct1     = 2*jmax + 1;
2765     ct2     = 0;
2766     rbuf2_i = rbuf2[tmp2];
2767     rbuf3_i = rbuf3[tmp2];
2768     rbuf4_i = rbuf4[tmp2];
2769     for (j=1; j<=jmax; j++) {
2770       is_no     = sbuf1_i[2*j-1];
2771       rmap_i    = rmap[is_no];
2772       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2773       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2774       imat_ilen = subc->ilen;
2775       imat_j    = subc->j;
2776       imat_i    = subc->i;
2777       imat_a    = subc->a;
2778       max1      = sbuf1_i[2*j];
2779       for (k=0; k<max1; k++,ct1++) {
2780         row = sbuf1_i[ct1];
2781 #if defined(PETSC_USE_CTABLE)
2782         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2783         row--;
2784 #else
2785         row = rmap_i[row];
2786 #endif
2787         ilen  = imat_ilen[row];
2788         mat_i = imat_i[row];
2789         mat_a = imat_a + mat_i;
2790         mat_j = imat_j + mat_i;
2791         max2  = rbuf2_i[ct1];
2792         if (!allcolumns[is_no]) {
2793           for (l=0; l<max2; l++,ct2++) {
2794 #if defined(PETSC_USE_CTABLE)
2795             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2796 #else
2797             tcol = cmap_i[rbuf3_i[ct2]];
2798 #endif
2799             if (tcol) {
2800               *mat_j++ = tcol - 1;
2801               *mat_a++ = rbuf4_i[ct2];
2802               ilen++;
2803             }
2804           }
2805         } else { /* allcolumns */
2806           for (l=0; l<max2; l++,ct2++) {
2807             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2808             *mat_a++ = rbuf4_i[ct2];
2809             ilen++;
2810           }
2811         }
2812         imat_ilen[row] = ilen;
2813       }
2814     }
2815   }
2816 
2817   if (!iscsorted) { /* sort column indices of the rows */
2818     for (i=0; i<ismax; i++) {
2819       subc      = (Mat_SeqAIJ*)submats[i]->data;
2820       imat_j    = subc->j;
2821       imat_i    = subc->i;
2822       imat_a    = subc->a;
2823       imat_ilen = subc->ilen;
2824 
2825       if (allcolumns[i]) continue;
2826       jmax = nrow[i];
2827       for (j=0; j<jmax; j++) {
2828         mat_i = imat_i[j];
2829         mat_a = imat_a + mat_i;
2830         mat_j = imat_j + mat_i;
2831         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2832       }
2833     }
2834   }
2835 
2836   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2837   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2838   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2839   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2840   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2841 
2842   /* Restore the indices */
2843   for (i=0; i<ismax; i++) {
2844     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2845     if (!allcolumns[i]) {
2846       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2847     }
2848   }
2849 
2850   for (i=0; i<ismax; i++) {
2851     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2852     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2853   }
2854 
2855   /* Destroy allocated memory */
2856   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2857   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2858   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2859 
2860   for (i=0; i<nrqs; ++i) {
2861     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2862   }
2863   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2864 
2865   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2866   PetscFunctionReturn(0);
2867 }
2868 
2869 /*
2870  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2871  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2872  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2873  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2874  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2875  state, and needs to be "assembled" later by compressing B's column space.
2876 
2877  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2878  Following this call, C->A & C->B have been created, even if empty.
2879  */
2880 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2881 {
2882   /* If making this function public, change the error returned in this function away from _PLIB. */
2883   PetscErrorCode ierr;
2884   Mat_MPIAIJ     *aij;
2885   Mat_SeqAIJ     *Baij;
2886   PetscBool      seqaij,Bdisassembled;
2887   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2888   PetscScalar    v;
2889   const PetscInt *rowindices,*colindices;
2890 
2891   PetscFunctionBegin;
2892   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2893   if (A) {
2894     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2895     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2896     if (rowemb) {
2897       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2898       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2899     } else {
2900       if (C->rmap->n != A->rmap->n) {
2901 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2902       }
2903     }
2904     if (dcolemb) {
2905       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2906       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2907     } else {
2908       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2909     }
2910   }
2911   if (B) {
2912     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2913     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2914     if (rowemb) {
2915       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2916       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2917     } else {
2918       if (C->rmap->n != B->rmap->n) {
2919 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2920       }
2921     }
2922     if (ocolemb) {
2923       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2924       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2925     } else {
2926       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2927     }
2928   }
2929 
2930   aij = (Mat_MPIAIJ*)C->data;
2931   if (!aij->A) {
2932     /* Mimic parts of MatMPIAIJSetPreallocation() */
2933     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2934     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2935     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2936     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2937     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2938   }
2939   if (A) {
2940     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2941   } else {
2942     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2943   }
2944   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2945     /*
2946       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2947       need to "disassemble" B -- convert it to using C's global indices.
2948       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2949 
2950       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2951       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2952 
2953       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2954       At least avoid calling MatSetValues() and the implied searches?
2955     */
2956 
2957     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2958 #if defined(PETSC_USE_CTABLE)
2959       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2960 #else
2961       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2962       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2963       if (aij->B) {
2964         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2965       }
2966 #endif
2967       ngcol = 0;
2968       if (aij->lvec) {
2969 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2970       }
2971       if (aij->garray) {
2972 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2973 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2974       }
2975       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2976       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2977     }
2978     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2979       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2980     }
2981     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2982       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2983     }
2984   }
2985   Bdisassembled = PETSC_FALSE;
2986   if (!aij->B) {
2987     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2988     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2989     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2990     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2991     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2992     Bdisassembled = PETSC_TRUE;
2993   }
2994   if (B) {
2995     Baij = (Mat_SeqAIJ*)B->data;
2996     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2997       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2998       for (i=0; i<B->rmap->n; i++) {
2999 	nz[i] = Baij->i[i+1] - Baij->i[i];
3000       }
3001       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3002       ierr = PetscFree(nz);CHKERRQ(ierr);
3003     }
3004 
3005     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3006     shift = rend-rstart;
3007     count = 0;
3008     rowindices = NULL;
3009     colindices = NULL;
3010     if (rowemb) {
3011       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3012     }
3013     if (ocolemb) {
3014       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3015     }
3016     for (i=0; i<B->rmap->n; i++) {
3017       PetscInt row;
3018       row = i;
3019       if (rowindices) row = rowindices[i];
3020       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3021 	col  = Baij->j[count];
3022 	if (colindices) col = colindices[col];
3023 	if (Bdisassembled && col>=rstart) col += shift;
3024 	v    = Baij->a[count];
3025 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3026 	++count;
3027       }
3028     }
3029     /* No assembly for aij->B is necessary. */
3030     /* FIXME: set aij->B's nonzerostate correctly. */
3031   } else {
3032     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3033   }
3034   C->preallocated  = PETSC_TRUE;
3035   C->was_assembled = PETSC_FALSE;
3036   C->assembled     = PETSC_FALSE;
3037    /*
3038       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3039       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3040    */
3041   PetscFunctionReturn(0);
3042 }
3043 
3044 /*
3045   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3046  */
3047 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3048 {
3049   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3050 
3051   PetscFunctionBegin;
3052   PetscValidPointer(A,2);
3053   PetscValidPointer(B,3);
3054   /* FIXME: make sure C is assembled */
3055   *A = aij->A;
3056   *B = aij->B;
3057   /* Note that we don't incref *A and *B, so be careful! */
3058   PetscFunctionReturn(0);
3059 }
3060 
3061 /*
3062   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3063   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3064 */
3065 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3066                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3067 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3068 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3069 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3070 {
3071   PetscErrorCode ierr;
3072   PetscMPIInt    isize,flag;
3073   PetscInt       i,ii,cismax,ispar;
3074   Mat            *A,*B;
3075   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3076 
3077   PetscFunctionBegin;
3078   if (!ismax) PetscFunctionReturn(0);
3079 
3080   for (i = 0, cismax = 0; i < ismax; ++i) {
3081     PetscMPIInt isize;
3082     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3083     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3084     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3085     if (isize > 1) ++cismax;
3086   }
3087 
3088   /*
3089      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3090      ispar counts the number of parallel ISs across C's comm.
3091   */
3092   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3093   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3094     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3095     PetscFunctionReturn(0);
3096   }
3097 
3098   /* if (ispar) */
3099   /*
3100     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3101     These are used to extract the off-diag portion of the resulting parallel matrix.
3102     The row IS for the off-diag portion is the same as for the diag portion,
3103     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3104   */
3105   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3106   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3107   for (i = 0, ii = 0; i < ismax; ++i) {
3108     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3109     if (isize > 1) {
3110       /*
3111 	 TODO: This is the part that's ***NOT SCALABLE***.
3112 	 To fix this we need to extract just the indices of C's nonzero columns
3113 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3114 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3115 	 to be done without serializing on the IS list, so, most likely, it is best
3116 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3117       */
3118       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3119       /* Now we have to
3120 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3121 	     were sorted on each rank, concatenated they might no longer be sorted;
3122 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3123 	     indices in the nondecreasing order to the original index positions.
3124 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3125       */
3126       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3127       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3128       ++ii;
3129     }
3130   }
3131   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3132   for (i = 0, ii = 0; i < ismax; ++i) {
3133     PetscInt       j,issize;
3134     const PetscInt *indices;
3135 
3136     /*
3137        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3138      */
3139     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3140     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3141     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3142     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3143     for (j = 1; j < issize; ++j) {
3144       if (indices[j] == indices[j-1]) {
3145 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3146       }
3147     }
3148     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3149 
3150 
3151     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3152     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3153     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3154     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3155     for (j = 1; j < issize; ++j) {
3156       if (indices[j-1] == indices[j]) {
3157 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3158       }
3159     }
3160     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3161     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3162     if (isize > 1) {
3163       cisrow[ii] = isrow[i];
3164       ++ii;
3165     }
3166   }
3167   /*
3168     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3169     array of sequential matrices underlying the resulting parallel matrices.
3170     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3171     contain duplicates.
3172 
3173     There are as many diag matrices as there are original index sets. There are only as many parallel
3174     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3175 
3176     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3177     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3178       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3179       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3180       with A[i] and B[ii] extracted from the corresponding MPI submat.
3181     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3182       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3183       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3184       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3185       values into A[i] and B[ii] sitting inside the corresponding submat.
3186     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3187       A[i], B[ii], AA[i] or BB[ii] matrices.
3188   */
3189   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3190   if (scall == MAT_INITIAL_MATRIX) {
3191     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3192   }
3193 
3194   /* Now obtain the sequential A and B submatrices separately. */
3195   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3196   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3197   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3198 
3199   /*
3200     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3201     matrices A & B have been extracted directly into the parallel matrices containing them, or
3202     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3203     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3204     to have the same sparsity pattern.
3205     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3206     must be constructed for C. This is done by setseqmat(s).
3207   */
3208   for (i = 0, ii = 0; i < ismax; ++i) {
3209     /*
3210        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3211        That way we can avoid sorting and computing permutations when reusing.
3212        To this end:
3213         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3214 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3215 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3216     */
3217     MatStructure pattern;
3218     pattern = DIFFERENT_NONZERO_PATTERN;
3219 
3220     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3221     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3222     if (isize > 1) {
3223       if (scall == MAT_INITIAL_MATRIX) {
3224 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3225 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3226 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3227 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3228 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3229       }
3230       /*
3231 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3232       */
3233       {
3234 	Mat AA,BB;
3235         AA = A[i];
3236         BB = B[ii];
3237 	if (AA || BB) {
3238 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3239 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3240 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3241 	}
3242 
3243         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3244       }
3245       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3246       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3247       ++ii;
3248     } else { /* if (isize == 1) */
3249       if (scall == MAT_REUSE_MATRIX) {
3250         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3251       }
3252       if (isrow_p[i] || iscol_p[i]) {
3253         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3254         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3255 	/* Otherwise A is extracted straight into (*submats)[i]. */
3256 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3257 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3258       } else (*submat)[i] = A[i];
3259     }
3260     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3261     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3262   }
3263   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3264   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3265   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3266   ierr = PetscFree(A);CHKERRQ(ierr);
3267   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3268   PetscFunctionReturn(0);
3269 }
3270 
/*
  MPIAIJ front end for extracting (possibly parallel) submatrices: delegates to
  MatCreateSubMatricesMPI_MPIXAIJ(), supplying the AIJ-specific callbacks for
  sequential extraction, local-part retrieval, and sequential/parallel embedding.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3279