xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a30ec4eab21047868b4ea34d315622f953eb6ffa)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one layer of overlap to each IS
  using a scalable communication scheme.

  Outline:
    1) collect all IS entries and find the owning rank of each row via the
       row layout;
    2) ship remotely owned rows (as <is id,row> pairs) to their owners over a
       PetscSF whose graph is built with PetscCommBuildTwoSided;
    3) owners compute the adjacency of those rows and pack replies
       (MatIncreaseOverlap_MPIAIJ_Send_Scalable);
    4) the local expansion (MatIncreaseOverlap_MPIAIJ_Local_Scalable) is
       overlapped with the reply communication;
    5) replies are merged into the IS by
       MatIncreaseOverlap_MPIAIJ_Receive_Scalable.
  If no rank has any remotely owned rows, only the local expansion runs.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record destination rank, source IS and row */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  /* prefix-sum the message sizes into per-destination offsets */
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  /* build the SF graph: one leaf per incoming PetscInt, rooted at the sender */
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes flow back along the reversed pattern */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received overlap data into the (already locally expanded) IS */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap data received
  back from row owners into the local index sets.

  recvdata is a concatenation of triples <is id, count, count columns...>
  (nrecvs PetscInts in total).  Each IS is copied into a scratch slab of
  (max_lsize+nrecvs) entries -- large enough for its current entries plus
  every received column in the worst case -- the received columns are
  appended to the matching slab, and each IS is recreated after sorting and
  removing duplicates.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  /* record current IS sizes and the largest one (sets the slab stride) */
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  /* copy the existing entries of each IS into its slab, then destroy the IS
     (it is recreated below with the merged entries) */
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - for rows received from other
  ranks, compute their adjacency (all column indices from both the diagonal
  Amat part and the off-diagonal Bmat part) and pack the reply messages.

  Input:
    fromrows  - nfrom messages, each a sequence of <is id, row> pairs
                (fromsizes[2*i] PetscInts per sender i)
  Output:
    sbrows     - packed reply data: per sender, triples
                 <is id, count, count sorted unique columns>
    sbrowsizes - per sender, <reply size, reply offset> pairs

  The work is done in two passes over the same grouping: pass 1 measures the
  space needed (indv_counts/totalrows), pass 2 fills the reply buffer.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  /* rows_data[j] is a per-IS slice of the shared scratch buffer */
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: group rows by IS and count the unique columns per <sender,IS> */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal as pass 1, now writing the reply buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the reply sizes into per-sender offsets */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  /* ownership of sbsizes/sbdata transfers to the caller */
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each IS with the column
  indices of its locally owned rows.

  For every index in is[i] that this rank owns, the row's columns from both
  the diagonal part (Amat, shifted by cstart to global numbering) and the
  off-diagonal part (Bmat, mapped through gcols) are collected; each IS is
  then recreated from the sorted, deduplicated collection.  Rows owned by
  other ranks are skipped here -- they are handled by the send/receive path.
  NOTE: the original IS entries themselves are not re-added; every locally
  owned row index reappears via its own diagonal entry only if that entry is
  stored in the matrix.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows handled elsewhere */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
447 */
/*
  MatIncreaseOverlap_MPIAIJ_Once - add one layer of overlap to each IS using
  point-to-point MPI (Isend/Irecv); the message format is described in the
  comment block above this function.

  Phase 1: each rank sends the rows it does not own to their owners (tag1).
  Local rows are expanded by MatIncreaseOverlap_MPIAIJ_Local while the
  messages are in flight.  Phase 2: owners expand the received rows
  (MatIncreaseOverlap_MPIAIJ_Receive) and send the results back (tag2),
  which are merged into the per-IS tables before the IS are recreated.
  With PETSC_USE_CTABLE the per-IS membership is kept in a PetscTable,
  otherwise in a dense array data[i] of up to M entries.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[j]: total rows destined for proc j; w2[j]: 1 iff a message goes to j;
     w3[j]: number of IS contributing rows for j; w4: per-IS scratch counts */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into per-destination message slots */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the phase-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received rows; xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: unpack the <is_no,count> header pairs, then
         insert each row not already in the bit table */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array for the largest table before flattening */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* recreate each IS from the accumulated rows (keys/values in the table
     are stored shifted by +1; positions order the entries) */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap(), to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;  /* diagonal (A) and off-diagonal (B) blocks of the MPIAIJ matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;             /* bit array marking which global columns are already in set i */
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;  /* hash table for set i: stores global index+1 -> position+1 */
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;       /* dense array of the global indices currently in set i */
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;       /* first global row owned by this process */
  cstart = C->cmap->rstart;       /* first global column of the local diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;             /* maps compressed off-diagonal column index -> global column */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;         /* table stores key+1/value+1; undo the shift so tdata[pos]=global index */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];             /* expand only the entries present before this call; new ones wait for the next round */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;  /* global row -> local row of the diagonal block */
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* local diagonal-block column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {  /* append val only if it was not already marked */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* compressed off-diagonal column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;               /* new size of index set i after one round of local overlap */

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
899 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (!(ct3 < mem_estimate)) {
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (!(ct3 < mem_estimate)) {
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* Update the header*/
1024       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 /*
1038     Every processor gets the entire matrix
1039 */
/*
   MatCreateSubMatrix_MPIAIJ_All - Every process obtains the entire matrix A as
   a sequential matrix *Bin[0]; with MAT_INITIAL_MATRIX the structure is built
   and gathered, with MAT_REUSE_MATRIX an existing *Bin[0] is refilled, and
   numerical values are gathered only when flag == MAT_GET_VALUES.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;  /* diagonal/off-diagonal blocks */
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = diagonal-block nonzeros + off-diagonal-block nonzeros */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* each rank already holds its own segment of lens[] at the right offset */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j[] at this rank's row offset */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row in sorted global-column order: below-diagonal B part, then A part, then above-diagonal B part */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are now per-rank nonzero counts, not row counts */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write directly into B's a[] at this rank's offset */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are packed in exactly the same merged order as the column indices above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path skipped the allocation above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1222 
1223 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1224 {
1225   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1226   Mat            submat,A = c->A,B = c->B;
1227   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1228   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1229   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1230   const PetscInt *icol,*irow;
1231   PetscInt       nrow,ncol,start;
1232   PetscErrorCode ierr;
1233   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1234   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1235   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1236   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1237   PetscInt       *lens,rmax,ncols,*cols,Crow;
1238 #if defined(PETSC_USE_CTABLE)
1239   PetscTable     cmap,rmap;
1240   PetscInt       *cmap_loc,*rmap_loc;
1241 #else
1242   PetscInt       *cmap,*rmap;
1243 #endif
1244   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1245   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1246   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1247   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1248   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1249   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1250   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1251   MPI_Comm       comm;
1252   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1253   PetscMPIInt    *onodes1,*olengths1,idex,end;
1254   Mat_SubSppt    *smatis1;
1255   PetscBool      isrowsorted;
1256 
1257   PetscFunctionBegin;
1258   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1259 
1260   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1261   size = c->size;
1262   rank = c->rank;
1263 
1264   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1265   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1266 
1267   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1268   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1269   if (allcolumns) {
1270     icol = NULL;
1271     ncol = C->cmap->N;
1272   } else {
1273     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1274     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1275   }
1276 
1277   if (scall == MAT_INITIAL_MATRIX) {
1278     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1279 
1280     /* Get some new tags to keep the communication clean */
1281     tag1 = ((PetscObject)C)->tag;
1282     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1283     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1284 
1285     /* evaluate communication - mesg to who, length of mesg, and buffer space
1286      required. Based on this, buffers are allocated, and data copied into them */
1287     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1288     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1289 
1290     /* w1[proc] = num of rows owned by proc -- to be requested */
1291     proc = 0;
1292     nrqs = 0; /* num of outgoing messages */
1293     for (j=0; j<nrow; j++) {
1294       row  = irow[j]; /* sorted! */
1295       while (row >= C->rmap->range[proc+1]) proc++;
1296       w1[proc]++;
1297       row2proc[j] = proc; /* map row index to proc */
1298 
1299       if (proc != rank && !w2[proc]) {
1300         w2[proc] = 1; nrqs++;
1301       }
1302     }
1303     w1[rank] = 0;  /* rows owned by self will not be requested */
1304 
1305     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1306     for (proc=0,j=0; proc<size; proc++) {
1307       if (w1[proc]) { pa[j++] = proc;}
1308     }
1309 
1310     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1311     msz = 0;              /* total mesg length (for all procs) */
1312     for (i=0; i<nrqs; i++) {
1313       proc      = pa[i];
1314       w1[proc] += 3;
1315       msz      += w1[proc];
1316     }
1317     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1318 
1319     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1320     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1321     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1322 
1323     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1324        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1325     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1326 
1327     /* Now post the Irecvs corresponding to these messages */
1328     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1329 
1330     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1331     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1332 
1333     /* Allocate Memory for outgoing messages */
1334     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1335     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1336     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1337 
1338     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1339     iptr = tmp;
1340     for (i=0; i<nrqs; i++) {
1341       proc        = pa[i];
1342       sbuf1[proc] = iptr;
1343       iptr       += w1[proc];
1344     }
1345 
1346     /* Form the outgoing messages */
1347     /* Initialize the header space */
1348     for (i=0; i<nrqs; i++) {
1349       proc      = pa[i];
1350       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1351       ptr[proc] = sbuf1[proc] + 3;
1352     }
1353 
1354     /* Parse the isrow and copy data into outbuf */
1355     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1356     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1357       proc = row2proc[j];
1358       if (proc != rank) { /* copy to the outgoing buf*/
1359         *ptr[proc] = irow[j];
1360         ctr[proc]++; ptr[proc]++;
1361       }
1362     }
1363 
1364     /* Update the headers for the current IS */
1365     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1366       if ((ctr_j = ctr[j])) {
1367         sbuf1_j        = sbuf1[j];
1368         k              = ++sbuf1_j[0];
1369         sbuf1_j[2*k]   = ctr_j;
1370         sbuf1_j[2*k-1] = 0;
1371       }
1372     }
1373 
1374     /* Now post the sends */
1375     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1376     for (i=0; i<nrqs; ++i) {
1377       proc = pa[i];
1378       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1379     }
1380 
1381     /* Post Receives to capture the buffer size */
1382     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1383     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1384 
1385     rbuf2[0] = tmp + msz;
1386     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1387 
1388     for (i=0; i<nrqs; ++i) {
1389       proc = pa[i];
1390       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1391     }
1392 
1393     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1394 
1395     /* Send to other procs the buf size they should allocate */
1396     /* Receive messages*/
1397     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1398     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1399 
1400     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1401     for (i=0; i<nrqr; ++i) {
1402       req_size[i] = 0;
1403       rbuf1_i        = rbuf1[i];
1404       start          = 2*rbuf1_i[0] + 1;
1405       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1406       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1407       sbuf2_i        = sbuf2[i];
1408       for (j=start; j<end; j++) {
1409         k            = rbuf1_i[j] - rstart;
1410         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1411         sbuf2_i[j]   = ncols;
1412         req_size[i] += ncols;
1413       }
1414       req_source1[i] = r_status1[i].MPI_SOURCE;
1415 
1416       /* form the header */
1417       sbuf2_i[0] = req_size[i];
1418       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1419 
1420       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1421     }
1422 
1423     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1424     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1425 
1426     /* rbuf2 is received, Post recv column indices a->j */
1427     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1428 
1429     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1430     for (i=0; i<nrqs; ++i) {
1431       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1432       req_source2[i] = r_status2[i].MPI_SOURCE;
1433       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1434     }
1435 
1436     /* Wait on sends1 and sends2 */
1437     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1438     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1439     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1440     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1441 
1442     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1443     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1444 
1445     /* Now allocate sending buffers for a->j, and send them off */
1446     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1447     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1448     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1449     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1450 
1451     for (i=0; i<nrqr; i++) { /* for each requested message */
1452       rbuf1_i   = rbuf1[i];
1453       sbuf_aj_i = sbuf_aj[i];
1454       ct1       = 2*rbuf1_i[0] + 1;
1455       ct2       = 0;
1456       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1457 
1458       kmax = rbuf1[i][2];
1459       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1460         row    = rbuf1_i[ct1] - rstart;
1461         nzA    = ai[row+1] - ai[row];
1462         nzB    = bi[row+1] - bi[row];
1463         ncols  = nzA + nzB;
1464         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1465 
1466         /* load the column indices for this row into cols*/
1467         cols = sbuf_aj_i + ct2;
1468 
1469         lwrite = 0;
1470         for (l=0; l<nzB; l++) {
1471           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472         }
1473         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474         for (l=0; l<nzB; l++) {
1475           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476         }
1477 
1478         ct2 += ncols;
1479       }
1480       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1481     }
1482 
1483     /* create column map (cmap): global col of C -> local col of submat */
1484 #if defined(PETSC_USE_CTABLE)
1485     if (!allcolumns) {
1486       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1487       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1488       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1489         if (icol[j] >= cstart && icol[j] <cend) {
1490           cmap_loc[icol[j] - cstart] = j+1;
1491         } else { /* use PetscTable for non-local col indices */
1492           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1493         }
1494       }
1495     } else {
1496       cmap     = NULL;
1497       cmap_loc = NULL;
1498     }
1499     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1500 #else
1501     if (!allcolumns) {
1502       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1503       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1504     } else {
1505       cmap = NULL;
1506     }
1507 #endif
1508 
1509     /* Create lens for MatSeqAIJSetPreallocation() */
1510     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1511 
1512     /* Compute lens from local part of C */
1513     for (j=0; j<nrow; j++) {
1514       row  = irow[j];
1515       proc = row2proc[j];
1516       if (proc == rank) {
1517         /* diagonal part A = c->A */
1518         ncols = ai[row-rstart+1] - ai[row-rstart];
1519         cols  = aj + ai[row-rstart];
1520         if (!allcolumns) {
1521           for (k=0; k<ncols; k++) {
1522 #if defined(PETSC_USE_CTABLE)
1523             tcol = cmap_loc[cols[k]];
1524 #else
1525             tcol = cmap[cols[k]+cstart];
1526 #endif
1527             if (tcol) lens[j]++;
1528           }
1529         } else { /* allcolumns */
1530           lens[j] = ncols;
1531         }
1532 
1533         /* off-diagonal part B = c->B */
1534         ncols = bi[row-rstart+1] - bi[row-rstart];
1535         cols  = bj + bi[row-rstart];
1536         if (!allcolumns) {
1537           for (k=0; k<ncols; k++) {
1538 #if defined(PETSC_USE_CTABLE)
1539             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1540 #else
1541             tcol = cmap[bmap[cols[k]]];
1542 #endif
1543             if (tcol) lens[j]++;
1544           }
1545         } else { /* allcolumns */
1546           lens[j] += ncols;
1547         }
1548       }
1549     }
1550 
1551     /* Create row map (rmap): global row of C -> local row of submat */
1552 #if defined(PETSC_USE_CTABLE)
1553     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1554     for (j=0; j<nrow; j++) {
1555       row  = irow[j];
1556       proc = row2proc[j];
1557       if (proc == rank) { /* a local row */
1558         rmap_loc[row - rstart] = j;
1559       } else {
1560         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1561       }
1562     }
1563 #else
1564     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1565     for (j=0; j<nrow; j++) {
1566       rmap[irow[j]] = j;
1567     }
1568 #endif
1569 
1570     /* Update lens from offproc data */
1571     /* recv a->j is done */
1572     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1573     for (i=0; i<nrqs; i++) {
1574       proc    = pa[i];
1575       sbuf1_i = sbuf1[proc];
1576       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1577       ct1     = 2 + 1;
1578       ct2     = 0;
1579       rbuf2_i = rbuf2[i]; /* received length of C->j */
1580       rbuf3_i = rbuf3[i]; /* received C->j */
1581 
1582       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1583       max1   = sbuf1_i[2];
1584       for (k=0; k<max1; k++,ct1++) {
1585 #if defined(PETSC_USE_CTABLE)
1586         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1587         row--;
1588         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1589 #else
1590         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1591 #endif
1592         /* Now, store row index of submat in sbuf1_i[ct1] */
1593         sbuf1_i[ct1] = row;
1594 
1595         nnz = rbuf2_i[ct1];
1596         if (!allcolumns) {
1597           for (l=0; l<nnz; l++,ct2++) {
1598 #if defined(PETSC_USE_CTABLE)
1599             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1600               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1601             } else {
1602               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1603             }
1604 #else
1605             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1606 #endif
1607             if (tcol) lens[row]++;
1608           }
1609         } else { /* allcolumns */
1610           lens[row] += nnz;
1611         }
1612       }
1613     }
1614     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1615     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1616 
1617     /* Create the submatrices */
1618     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1619     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1620 
1621     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1622     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1623     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1624     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1625     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1626 
1627     /* create struct Mat_SubSppt and attached it to submat */
1628     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1629     subc = (Mat_SeqAIJ*)submat->data;
1630     subc->submatis1 = smatis1;
1631 
1632     smatis1->id          = 0;
1633     smatis1->nrqs        = nrqs;
1634     smatis1->nrqr        = nrqr;
1635     smatis1->rbuf1       = rbuf1;
1636     smatis1->rbuf2       = rbuf2;
1637     smatis1->rbuf3       = rbuf3;
1638     smatis1->sbuf2       = sbuf2;
1639     smatis1->req_source2 = req_source2;
1640 
1641     smatis1->sbuf1       = sbuf1;
1642     smatis1->ptr         = ptr;
1643     smatis1->tmp         = tmp;
1644     smatis1->ctr         = ctr;
1645 
1646     smatis1->pa           = pa;
1647     smatis1->req_size     = req_size;
1648     smatis1->req_source1  = req_source1;
1649 
1650     smatis1->allcolumns  = allcolumns;
1651     smatis1->singleis    = PETSC_TRUE;
1652     smatis1->row2proc    = row2proc;
1653     smatis1->rmap        = rmap;
1654     smatis1->cmap        = cmap;
1655 #if defined(PETSC_USE_CTABLE)
1656     smatis1->rmap_loc    = rmap_loc;
1657     smatis1->cmap_loc    = cmap_loc;
1658 #endif
1659 
1660     smatis1->destroy     = submat->ops->destroy;
1661     submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
1662     submat->factortype   = C->factortype;
1663 
1664     /* compute rmax */
1665     rmax = 0;
1666     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1667 
1668   } else { /* scall == MAT_REUSE_MATRIX */
1669     submat = submats[0];
1670     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1671 
1672     subc    = (Mat_SeqAIJ*)submat->data;
1673     rmax    = subc->rmax;
1674     smatis1 = subc->submatis1;
1675     nrqs        = smatis1->nrqs;
1676     nrqr        = smatis1->nrqr;
1677     rbuf1       = smatis1->rbuf1;
1678     rbuf2       = smatis1->rbuf2;
1679     rbuf3       = smatis1->rbuf3;
1680     req_source2 = smatis1->req_source2;
1681 
1682     sbuf1     = smatis1->sbuf1;
1683     sbuf2     = smatis1->sbuf2;
1684     ptr       = smatis1->ptr;
1685     tmp       = smatis1->tmp;
1686     ctr       = smatis1->ctr;
1687 
1688     pa         = smatis1->pa;
1689     req_size   = smatis1->req_size;
1690     req_source1 = smatis1->req_source1;
1691 
1692     allcolumns = smatis1->allcolumns;
1693     row2proc   = smatis1->row2proc;
1694     rmap       = smatis1->rmap;
1695     cmap       = smatis1->cmap;
1696 #if defined(PETSC_USE_CTABLE)
1697     rmap_loc   = smatis1->rmap_loc;
1698     cmap_loc   = smatis1->cmap_loc;
1699 #endif
1700   }
1701 
1702   /* Post recv matrix values */
1703   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1704   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1705   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1706   for (i=0; i<nrqs; ++i) {
1707     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1708     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1709   }
1710 
1711   /* Allocate sending buffers for a->a, and send them off */
1712   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1713   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1714   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1715   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1716 
1717   for (i=0; i<nrqr; i++) {
1718     rbuf1_i   = rbuf1[i];
1719     sbuf_aa_i = sbuf_aa[i];
1720     ct1       = 2*rbuf1_i[0]+1;
1721     ct2       = 0;
1722     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1723 
1724     kmax = rbuf1_i[2];
1725     for (k=0; k<kmax; k++,ct1++) {
1726       row = rbuf1_i[ct1] - rstart;
1727       nzA = ai[row+1] - ai[row];
1728       nzB = bi[row+1] - bi[row];
1729       ncols  = nzA + nzB;
1730       cworkB = bj + bi[row];
1731       vworkA = a_a + ai[row];
1732       vworkB = b_a + bi[row];
1733 
1734       /* load the column values for this row into vals*/
1735       vals = sbuf_aa_i + ct2;
1736 
1737       lwrite = 0;
1738       for (l=0; l<nzB; l++) {
1739         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1740       }
1741       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1742       for (l=0; l<nzB; l++) {
1743         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1744       }
1745 
1746       ct2 += ncols;
1747     }
1748     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1749   }
1750 
1751   /* Assemble submat */
1752   /* First assemble the local rows */
1753   for (j=0; j<nrow; j++) {
1754     row  = irow[j];
1755     proc = row2proc[j];
1756     if (proc == rank) {
1757       Crow = row - rstart;  /* local row index of C */
1758 #if defined(PETSC_USE_CTABLE)
1759       row = rmap_loc[Crow]; /* row index of submat */
1760 #else
1761       row = rmap[row];
1762 #endif
1763 
1764       if (allcolumns) {
1765         /* diagonal part A = c->A */
1766         ncols = ai[Crow+1] - ai[Crow];
1767         cols  = aj + ai[Crow];
1768         vals  = a->a + ai[Crow];
1769         i     = 0;
1770         for (k=0; k<ncols; k++) {
1771           subcols[i]   = cols[k] + cstart;
1772           subvals[i++] = vals[k];
1773         }
1774 
1775         /* off-diagonal part B = c->B */
1776         ncols = bi[Crow+1] - bi[Crow];
1777         cols  = bj + bi[Crow];
1778         vals  = b->a + bi[Crow];
1779         for (k=0; k<ncols; k++) {
1780           subcols[i]   = bmap[cols[k]];
1781           subvals[i++] = vals[k];
1782         }
1783 
1784         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1785 
1786       } else { /* !allcolumns */
1787 #if defined(PETSC_USE_CTABLE)
1788         /* diagonal part A = c->A */
1789         ncols = ai[Crow+1] - ai[Crow];
1790         cols  = aj + ai[Crow];
1791         vals  = a->a + ai[Crow];
1792         i     = 0;
1793         for (k=0; k<ncols; k++) {
1794           tcol = cmap_loc[cols[k]];
1795           if (tcol) {
1796             subcols[i]   = --tcol;
1797             subvals[i++] = vals[k];
1798           }
1799         }
1800 
1801         /* off-diagonal part B = c->B */
1802         ncols = bi[Crow+1] - bi[Crow];
1803         cols  = bj + bi[Crow];
1804         vals  = b->a + bi[Crow];
1805         for (k=0; k<ncols; k++) {
1806           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1807           if (tcol) {
1808             subcols[i]   = --tcol;
1809             subvals[i++] = vals[k];
1810           }
1811         }
1812 #else
1813         /* diagonal part A = c->A */
1814         ncols = ai[Crow+1] - ai[Crow];
1815         cols  = aj + ai[Crow];
1816         vals  = a->a + ai[Crow];
1817         i     = 0;
1818         for (k=0; k<ncols; k++) {
1819           tcol = cmap[cols[k]+cstart];
1820           if (tcol) {
1821             subcols[i]   = --tcol;
1822             subvals[i++] = vals[k];
1823           }
1824         }
1825 
1826         /* off-diagonal part B = c->B */
1827         ncols = bi[Crow+1] - bi[Crow];
1828         cols  = bj + bi[Crow];
1829         vals  = b->a + bi[Crow];
1830         for (k=0; k<ncols; k++) {
1831           tcol = cmap[bmap[cols[k]]];
1832           if (tcol) {
1833             subcols[i]   = --tcol;
1834             subvals[i++] = vals[k];
1835           }
1836         }
1837 #endif
1838         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1839       }
1840     }
1841   }
1842 
1843   /* Now assemble the off-proc rows */
1844   for (i=0; i<nrqs; i++) { /* for each requested message */
1845     /* recv values from other processes */
1846     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1847     proc    = pa[idex];
1848     sbuf1_i = sbuf1[proc];
1849     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1850     ct1     = 2 + 1;
1851     ct2     = 0; /* count of received C->j */
1852     ct3     = 0; /* count of received C->j that will be inserted into submat */
1853     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1854     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1855     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1856 
1857     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1858     max1 = sbuf1_i[2];             /* num of rows */
1859     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1860       row = sbuf1_i[ct1]; /* row index of submat */
1861       if (!allcolumns) {
1862         idex = 0;
1863         if (scall == MAT_INITIAL_MATRIX) {
1864           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1865           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1866 #if defined(PETSC_USE_CTABLE)
1867             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1868               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1869             } else {
1870               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1871             }
1872 #else
1873             tcol = cmap[rbuf3_i[ct2]];
1874 #endif
1875             if (tcol) {
1876               subcols[idex]   = --tcol;
1877               subvals[idex++] = rbuf4_i[ct2];
1878 
1879               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1880                For reuse, we replace received C->j with index that should be inserted to submat */
1881               rbuf3_i[ct3++] = ct2;
1882             }
1883           }
1884           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1885 
1886         } else { /* scall == MAT_REUSE_MATRIX */
1887           submat = submats[0];
1888           subc   = (Mat_SeqAIJ*)submat->data;
1889 
1890           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1891           for (l=0; l<nnz; l++) {
1892             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1893             subvals[idex++] = rbuf4_i[ct2];
1894           }
1895 
1896           bj = subc->j + subc->i[row];
1897           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1898         }
1899       } else { /* allcolumns */
1900         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1901         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1902         ct2 += nnz;
1903       }
1904     }
1905   }
1906 
1907   /* sending a->a are done */
1908   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1909   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1910 
1911   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1912   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1913   submats[0] = submat;
1914 
1915   /* Restore the indices */
1916   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1917   if (!allcolumns) {
1918     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1919   }
1920 
1921   /* Destroy allocated memory */
1922   for (i=0; i<nrqs; ++i) {
1923     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1924   }
1925   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1926   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1927   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1928 
1929   if (scall == MAT_INITIAL_MATRIX) {
1930     ierr = PetscFree(lens);CHKERRQ(ierr);
1931     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1932     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1933   }
1934   PetscFunctionReturn(0);
1935 }
1936 
1937 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1938 {
1939   PetscErrorCode ierr;
1940   PetscInt       ncol;
1941   PetscBool      colflag,allcolumns=PETSC_FALSE;
1942 
1943   PetscFunctionBegin;
1944   /* Allocate memory to hold all the submatrices */
1945   if (scall == MAT_INITIAL_MATRIX) {
1946     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1947   }
1948 
1949   /* Check for special case: each processor gets entire matrix columns */
1950   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1951   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1952   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1953 
1954   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1955   PetscFunctionReturn(0);
1956 }
1957 
/*
   Extract sequential submatrices C[isrow[i],iscol[i]] from the parallel matrix C.
   Work is split into stages so per-stage communication buffers stay bounded;
   every process must agree on the stage count, hence the MPIU_Allreduce() below.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* nmax = max number of submatrices per stage, sized so each stage's index
     buffers use at most ~20M PetscInts (see the staging comment below) */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One reduction for both values: MAX of the negated flag implements a
       global AND (fast path only when every process wants the whole matrix);
       MAX of the local stage counts gives the global stage count */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  /* Staged extraction: each stage handles up to nmax index sets; a process
     whose sets are exhausted still participates with max_no = 0 so the
     collective calls inside the Local routine stay matched across ranks */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2055 
2056 /* -------------------------------------------------------------------------*/
2057 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2058 {
2059   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2060   Mat            A  = c->A;
2061   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2062   const PetscInt **icol,**irow;
2063   PetscInt       *nrow,*ncol,start;
2064   PetscErrorCode ierr;
2065   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2066   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2067   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2068   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2069   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2070 #if defined(PETSC_USE_CTABLE)
2071   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2072 #else
2073   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2074 #endif
2075   const PetscInt *irow_i;
2076   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2077   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2078   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2079   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2080   MPI_Status     *r_status3,*r_status4,*s_status4;
2081   MPI_Comm       comm;
2082   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2083   PetscMPIInt    *onodes1,*olengths1,end;
2084   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2085   Mat_SubSppt    *smat_i;
2086   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2087   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2088 
2089   PetscFunctionBegin;
2090   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2091   size = c->size;
2092   rank = c->rank;
2093 
2094   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2095   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2096 
2097   for (i=0; i<ismax; i++) {
2098     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2099     if (!issorted[i]) iscsorted = issorted[i];
2100 
2101     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2102 
2103     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2104     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2105 
2106     /* Check for special case: allcolumn */
2107     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2108     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2109     if (colflag && ncol[i] == C->cmap->N) {
2110       allcolumns[i] = PETSC_TRUE;
2111       icol[i] = NULL;
2112     } else {
2113       allcolumns[i] = PETSC_FALSE;
2114       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2115     }
2116   }
2117 
2118   if (scall == MAT_REUSE_MATRIX) {
2119     /* Assumes new rows are same length as the old rows */
2120     for (i=0; i<ismax; i++) {
2121       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2122       subc = (Mat_SeqAIJ*)submats[i]->data;
2123       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2124 
2125       /* Initial matrix as if empty */
2126       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2127 
2128       smat_i   = subc->submatis1;
2129 
2130       nrqs        = smat_i->nrqs;
2131       nrqr        = smat_i->nrqr;
2132       rbuf1       = smat_i->rbuf1;
2133       rbuf2       = smat_i->rbuf2;
2134       rbuf3       = smat_i->rbuf3;
2135       req_source2 = smat_i->req_source2;
2136 
2137       sbuf1     = smat_i->sbuf1;
2138       sbuf2     = smat_i->sbuf2;
2139       ptr       = smat_i->ptr;
2140       tmp       = smat_i->tmp;
2141       ctr       = smat_i->ctr;
2142 
2143       pa          = smat_i->pa;
2144       req_size    = smat_i->req_size;
2145       req_source1 = smat_i->req_source1;
2146 
2147       allcolumns[i] = smat_i->allcolumns;
2148       row2proc[i]   = smat_i->row2proc;
2149       rmap[i]       = smat_i->rmap;
2150       cmap[i]       = smat_i->cmap;
2151     }
2152 
2153     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2154       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2155       smat_i = (Mat_SubSppt*)submats[0]->data;
2156 
2157       nrqs        = smat_i->nrqs;
2158       nrqr        = smat_i->nrqr;
2159       rbuf1       = smat_i->rbuf1;
2160       rbuf2       = smat_i->rbuf2;
2161       rbuf3       = smat_i->rbuf3;
2162       req_source2 = smat_i->req_source2;
2163 
2164       sbuf1       = smat_i->sbuf1;
2165       sbuf2       = smat_i->sbuf2;
2166       ptr         = smat_i->ptr;
2167       tmp         = smat_i->tmp;
2168       ctr         = smat_i->ctr;
2169 
2170       pa          = smat_i->pa;
2171       req_size    = smat_i->req_size;
2172       req_source1 = smat_i->req_source1;
2173 
2174       allcolumns[0] = PETSC_FALSE;
2175     }
2176   } else { /* scall == MAT_INITIAL_MATRIX */
2177     /* Get some new tags to keep the communication clean */
2178     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2179     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2180 
2181     /* evaluate communication - mesg to who, length of mesg, and buffer space
2182      required. Based on this, buffers are allocated, and data copied into them*/
2183     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2184 
2185     for (i=0; i<ismax; i++) {
2186       jmax   = nrow[i];
2187       irow_i = irow[i];
2188 
2189       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2190       row2proc[i] = row2proc_i;
2191 
2192       if (issorted[i]) proc = 0;
2193       for (j=0; j<jmax; j++) {
2194         if (!issorted[i]) proc = 0;
2195         row = irow_i[j];
2196         while (row >= C->rmap->range[proc+1]) proc++;
2197         w4[proc]++;
2198         row2proc_i[j] = proc; /* map row index to proc */
2199       }
2200       for (j=0; j<size; j++) {
2201         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2202       }
2203     }
2204 
2205     nrqs     = 0;              /* no of outgoing messages */
2206     msz      = 0;              /* total mesg length (for all procs) */
2207     w1[rank] = 0;              /* no mesg sent to self */
2208     w3[rank] = 0;
2209     for (i=0; i<size; i++) {
2210       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2211     }
2212     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2213     for (i=0,j=0; i<size; i++) {
2214       if (w1[i]) { pa[j] = i; j++; }
2215     }
2216 
2217     /* Each message would have a header = 1 + 2*(no of IS) + data */
2218     for (i=0; i<nrqs; i++) {
2219       j      = pa[i];
2220       w1[j] += w2[j] + 2* w3[j];
2221       msz   += w1[j];
2222     }
2223     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2224 
2225     /* Determine the number of messages to expect, their lengths, from from-ids */
2226     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2227     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2228 
2229     /* Now post the Irecvs corresponding to these messages */
2230     tag0 = ((PetscObject)C)->tag;
2231     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2232 
2233     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2234     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2235 
2236     /* Allocate Memory for outgoing messages */
2237     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2238     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2239     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2240 
2241     {
2242       PetscInt *iptr = tmp;
2243       k    = 0;
2244       for (i=0; i<nrqs; i++) {
2245         j        = pa[i];
2246         iptr    += k;
2247         sbuf1[j] = iptr;
2248         k        = w1[j];
2249       }
2250     }
2251 
2252     /* Form the outgoing messages. Initialize the header space */
2253     for (i=0; i<nrqs; i++) {
2254       j           = pa[i];
2255       sbuf1[j][0] = 0;
2256       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2257       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2258     }
2259 
2260     /* Parse the isrow and copy data into outbuf */
2261     for (i=0; i<ismax; i++) {
2262       row2proc_i = row2proc[i];
2263       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2264       irow_i = irow[i];
2265       jmax   = nrow[i];
2266       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2267         proc = row2proc_i[j];
2268         if (proc != rank) { /* copy to the outgoing buf*/
2269           ctr[proc]++;
2270           *ptr[proc] = irow_i[j];
2271           ptr[proc]++;
2272         }
2273       }
2274       /* Update the headers for the current IS */
2275       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2276         if ((ctr_j = ctr[j])) {
2277           sbuf1_j        = sbuf1[j];
2278           k              = ++sbuf1_j[0];
2279           sbuf1_j[2*k]   = ctr_j;
2280           sbuf1_j[2*k-1] = i;
2281         }
2282       }
2283     }
2284 
2285     /*  Now  post the sends */
2286     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2287     for (i=0; i<nrqs; ++i) {
2288       j    = pa[i];
2289       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2290     }
2291 
2292     /* Post Receives to capture the buffer size */
2293     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2294     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2295     rbuf2[0] = tmp + msz;
2296     for (i=1; i<nrqs; ++i) {
2297       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2298     }
2299     for (i=0; i<nrqs; ++i) {
2300       j    = pa[i];
2301       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2302     }
2303 
2304     /* Send to other procs the buf size they should allocate */
2305     /* Receive messages*/
2306     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2307     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2308     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2309     {
2310       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2311       PetscInt   *sbuf2_i;
2312 
2313       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2314       for (i=0; i<nrqr; ++i) {
2315         req_size[i] = 0;
2316         rbuf1_i        = rbuf1[i];
2317         start          = 2*rbuf1_i[0] + 1;
2318         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2319         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2320         sbuf2_i        = sbuf2[i];
2321         for (j=start; j<end; j++) {
2322           id              = rbuf1_i[j] - rstart;
2323           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2324           sbuf2_i[j]      = ncols;
2325           req_size[i] += ncols;
2326         }
2327         req_source1[i] = r_status1[i].MPI_SOURCE;
2328         /* form the header */
2329         sbuf2_i[0] = req_size[i];
2330         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2331 
2332         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2333       }
2334     }
2335     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2336     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2337     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2338 
2339     /* Receive messages*/
2340     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2341     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2342 
2343     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2344     for (i=0; i<nrqs; ++i) {
2345       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2346       req_source2[i] = r_status2[i].MPI_SOURCE;
2347       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2348     }
2349     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2350     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2351 
2352     /* Wait on sends1 and sends2 */
2353     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2354     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2355 
2356     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2357     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2358     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2359     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2360     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2361     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2362 
2363     /* Now allocate sending buffers for a->j, and send them off */
2364     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2365     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2366     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2367     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2368 
2369     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2370     {
2371       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2372       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2373       PetscInt cend = C->cmap->rend;
2374       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2375 
2376       for (i=0; i<nrqr; i++) {
2377         rbuf1_i   = rbuf1[i];
2378         sbuf_aj_i = sbuf_aj[i];
2379         ct1       = 2*rbuf1_i[0] + 1;
2380         ct2       = 0;
2381         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2382           kmax = rbuf1[i][2*j];
2383           for (k=0; k<kmax; k++,ct1++) {
2384             row    = rbuf1_i[ct1] - rstart;
2385             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2386             ncols  = nzA + nzB;
2387             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2388 
2389             /* load the column indices for this row into cols */
2390             cols = sbuf_aj_i + ct2;
2391 
2392             lwrite = 0;
2393             for (l=0; l<nzB; l++) {
2394               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2395             }
2396             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2397             for (l=0; l<nzB; l++) {
2398               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2399             }
2400 
2401             ct2 += ncols;
2402           }
2403         }
2404         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2405       }
2406     }
2407     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2408 
2409     /* create col map: global col of C -> local col of submatrices */
2410     {
2411       const PetscInt *icol_i;
2412 #if defined(PETSC_USE_CTABLE)
2413       for (i=0; i<ismax; i++) {
2414         if (!allcolumns[i]) {
2415           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2416 
2417           jmax   = ncol[i];
2418           icol_i = icol[i];
2419           cmap_i = cmap[i];
2420           for (j=0; j<jmax; j++) {
2421             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2422           }
2423         } else cmap[i] = NULL;
2424       }
2425 #else
2426       for (i=0; i<ismax; i++) {
2427         if (!allcolumns[i]) {
2428           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2429           jmax   = ncol[i];
2430           icol_i = icol[i];
2431           cmap_i = cmap[i];
2432           for (j=0; j<jmax; j++) {
2433             cmap_i[icol_i[j]] = j+1;
2434           }
2435         } else cmap[i] = NULL;
2436       }
2437 #endif
2438     }
2439 
2440     /* Create lens which is required for MatCreate... */
2441     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2442     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2443 
2444     if (ismax) {
2445       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2446     }
2447     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2448 
2449     /* Update lens from local data */
2450     for (i=0; i<ismax; i++) {
2451       row2proc_i = row2proc[i];
2452       jmax = nrow[i];
2453       if (!allcolumns[i]) cmap_i = cmap[i];
2454       irow_i = irow[i];
2455       lens_i = lens[i];
2456       for (j=0; j<jmax; j++) {
2457         row = irow_i[j];
2458         proc = row2proc_i[j];
2459         if (proc == rank) {
2460           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2461           if (!allcolumns[i]) {
2462             for (k=0; k<ncols; k++) {
2463 #if defined(PETSC_USE_CTABLE)
2464               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2465 #else
2466               tcol = cmap_i[cols[k]];
2467 #endif
2468               if (tcol) lens_i[j]++;
2469             }
2470           } else { /* allcolumns */
2471             lens_i[j] = ncols;
2472           }
2473           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2474         }
2475       }
2476     }
2477 
2478     /* Create row map: global row of C -> local row of submatrices */
2479 #if defined(PETSC_USE_CTABLE)
2480     for (i=0; i<ismax; i++) {
2481       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2482       irow_i = irow[i];
2483       jmax   = nrow[i];
2484       for (j=0; j<jmax; j++) {
2485       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2486       }
2487     }
2488 #else
2489     for (i=0; i<ismax; i++) {
2490       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2491       rmap_i = rmap[i];
2492       irow_i = irow[i];
2493       jmax   = nrow[i];
2494       for (j=0; j<jmax; j++) {
2495         rmap_i[irow_i[j]] = j;
2496       }
2497     }
2498 #endif
2499 
2500     /* Update lens from offproc data */
2501     {
2502       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2503 
2504       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2505       for (tmp2=0; tmp2<nrqs; tmp2++) {
2506         sbuf1_i = sbuf1[pa[tmp2]];
2507         jmax    = sbuf1_i[0];
2508         ct1     = 2*jmax+1;
2509         ct2     = 0;
2510         rbuf2_i = rbuf2[tmp2];
2511         rbuf3_i = rbuf3[tmp2];
2512         for (j=1; j<=jmax; j++) {
2513           is_no  = sbuf1_i[2*j-1];
2514           max1   = sbuf1_i[2*j];
2515           lens_i = lens[is_no];
2516           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2517           rmap_i = rmap[is_no];
2518           for (k=0; k<max1; k++,ct1++) {
2519 #if defined(PETSC_USE_CTABLE)
2520             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2521             row--;
2522             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2523 #else
2524             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2525 #endif
2526             max2 = rbuf2_i[ct1];
2527             for (l=0; l<max2; l++,ct2++) {
2528               if (!allcolumns[is_no]) {
2529 #if defined(PETSC_USE_CTABLE)
2530                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2531 #else
2532                 tcol = cmap_i[rbuf3_i[ct2]];
2533 #endif
2534                 if (tcol) lens_i[row]++;
2535               } else { /* allcolumns */
2536                 lens_i[row]++; /* lens_i[row] += max2 ? */
2537               }
2538             }
2539           }
2540         }
2541       }
2542     }
2543     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2544     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2545     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2546     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2547 
2548     /* Create the submatrices */
2549     for (i=0; i<ismax; i++) {
2550       PetscInt    rbs,cbs;
2551 
2552       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2553       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2554 
2555       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2556       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2557 
2558       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2559       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2560       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2561 
2562       /* create struct Mat_SubSppt and attached it to submat */
2563       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2564       subc = (Mat_SeqAIJ*)submats[i]->data;
2565       subc->submatis1 = smat_i;
2566 
2567       smat_i->destroy          = submats[i]->ops->destroy;
2568       submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
2569       submats[i]->factortype   = C->factortype;
2570 
2571       smat_i->id          = i;
2572       smat_i->nrqs        = nrqs;
2573       smat_i->nrqr        = nrqr;
2574       smat_i->rbuf1       = rbuf1;
2575       smat_i->rbuf2       = rbuf2;
2576       smat_i->rbuf3       = rbuf3;
2577       smat_i->sbuf2       = sbuf2;
2578       smat_i->req_source2 = req_source2;
2579 
2580       smat_i->sbuf1       = sbuf1;
2581       smat_i->ptr         = ptr;
2582       smat_i->tmp         = tmp;
2583       smat_i->ctr         = ctr;
2584 
2585       smat_i->pa           = pa;
2586       smat_i->req_size     = req_size;
2587       smat_i->req_source1  = req_source1;
2588 
2589       smat_i->allcolumns  = allcolumns[i];
2590       smat_i->singleis    = PETSC_FALSE;
2591       smat_i->row2proc    = row2proc[i];
2592       smat_i->rmap        = rmap[i];
2593       smat_i->cmap        = cmap[i];
2594     }
2595 
2596     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2597       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2598       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2599       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2600 
2601       /* create struct Mat_SubSppt and attached it to submat */
2602       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2603       submats[0]->data = (void*)smat_i;
2604 
2605       smat_i->destroy          = submats[0]->ops->destroy;
2606       submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
2607       submats[0]->factortype   = C->factortype;
2608 
2609       smat_i->id          = 0;
2610       smat_i->nrqs        = nrqs;
2611       smat_i->nrqr        = nrqr;
2612       smat_i->rbuf1       = rbuf1;
2613       smat_i->rbuf2       = rbuf2;
2614       smat_i->rbuf3       = rbuf3;
2615       smat_i->sbuf2       = sbuf2;
2616       smat_i->req_source2 = req_source2;
2617 
2618       smat_i->sbuf1       = sbuf1;
2619       smat_i->ptr         = ptr;
2620       smat_i->tmp         = tmp;
2621       smat_i->ctr         = ctr;
2622 
2623       smat_i->pa           = pa;
2624       smat_i->req_size     = req_size;
2625       smat_i->req_source1  = req_source1;
2626 
2627       smat_i->allcolumns  = PETSC_FALSE;
2628       smat_i->singleis    = PETSC_FALSE;
2629       smat_i->row2proc    = NULL;
2630       smat_i->rmap        = NULL;
2631       smat_i->cmap        = NULL;
2632     }
2633 
2634     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2635     ierr = PetscFree(lens);CHKERRQ(ierr);
2636     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2637     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2638 
2639   } /* endof scall == MAT_INITIAL_MATRIX */
2640 
2641   /* Post recv matrix values */
2642   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2643   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2644   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2645   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2646   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2647   for (i=0; i<nrqs; ++i) {
2648     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2649     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2650   }
2651 
2652   /* Allocate sending buffers for a->a, and send them off */
2653   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2654   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2655   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2656   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2657 
2658   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2659   {
2660     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2661     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2662     PetscInt    cend   = C->cmap->rend;
2663     PetscInt    *b_j   = b->j;
2664     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2665 
2666     for (i=0; i<nrqr; i++) {
2667       rbuf1_i   = rbuf1[i];
2668       sbuf_aa_i = sbuf_aa[i];
2669       ct1       = 2*rbuf1_i[0]+1;
2670       ct2       = 0;
2671       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2672         kmax = rbuf1_i[2*j];
2673         for (k=0; k<kmax; k++,ct1++) {
2674           row    = rbuf1_i[ct1] - rstart;
2675           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2676           ncols  = nzA + nzB;
2677           cworkB = b_j + b_i[row];
2678           vworkA = a_a + a_i[row];
2679           vworkB = b_a + b_i[row];
2680 
2681           /* load the column values for this row into vals*/
2682           vals = sbuf_aa_i+ct2;
2683 
2684           lwrite = 0;
2685           for (l=0; l<nzB; l++) {
2686             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2687           }
2688           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2689           for (l=0; l<nzB; l++) {
2690             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2691           }
2692 
2693           ct2 += ncols;
2694         }
2695       }
2696       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2697     }
2698   }
2699 
2700   /* Assemble the matrices */
2701   /* First assemble the local rows */
2702   for (i=0; i<ismax; i++) {
2703     row2proc_i = row2proc[i];
2704     subc      = (Mat_SeqAIJ*)submats[i]->data;
2705     imat_ilen = subc->ilen;
2706     imat_j    = subc->j;
2707     imat_i    = subc->i;
2708     imat_a    = subc->a;
2709 
2710     if (!allcolumns[i]) cmap_i = cmap[i];
2711     rmap_i = rmap[i];
2712     irow_i = irow[i];
2713     jmax   = nrow[i];
2714     for (j=0; j<jmax; j++) {
2715       row  = irow_i[j];
2716       proc = row2proc_i[j];
2717       if (proc == rank) {
2718         old_row = row;
2719 #if defined(PETSC_USE_CTABLE)
2720         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2721         row--;
2722 #else
2723         row = rmap_i[row];
2724 #endif
2725         ilen_row = imat_ilen[row];
2726         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2727         mat_i    = imat_i[row];
2728         mat_a    = imat_a + mat_i;
2729         mat_j    = imat_j + mat_i;
2730         if (!allcolumns[i]) {
2731           for (k=0; k<ncols; k++) {
2732 #if defined(PETSC_USE_CTABLE)
2733             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2734 #else
2735             tcol = cmap_i[cols[k]];
2736 #endif
2737             if (tcol) {
2738               *mat_j++ = tcol - 1;
2739               *mat_a++ = vals[k];
2740               ilen_row++;
2741             }
2742           }
2743         } else { /* allcolumns */
2744           for (k=0; k<ncols; k++) {
2745             *mat_j++ = cols[k];  /* global col index! */
2746             *mat_a++ = vals[k];
2747             ilen_row++;
2748           }
2749         }
2750         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2751 
2752         imat_ilen[row] = ilen_row;
2753       }
2754     }
2755   }
2756 
2757   /* Now assemble the off proc rows */
2758   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2759   for (tmp2=0; tmp2<nrqs; tmp2++) {
2760     sbuf1_i = sbuf1[pa[tmp2]];
2761     jmax    = sbuf1_i[0];
2762     ct1     = 2*jmax + 1;
2763     ct2     = 0;
2764     rbuf2_i = rbuf2[tmp2];
2765     rbuf3_i = rbuf3[tmp2];
2766     rbuf4_i = rbuf4[tmp2];
2767     for (j=1; j<=jmax; j++) {
2768       is_no     = sbuf1_i[2*j-1];
2769       rmap_i    = rmap[is_no];
2770       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2771       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2772       imat_ilen = subc->ilen;
2773       imat_j    = subc->j;
2774       imat_i    = subc->i;
2775       imat_a    = subc->a;
2776       max1      = sbuf1_i[2*j];
2777       for (k=0; k<max1; k++,ct1++) {
2778         row = sbuf1_i[ct1];
2779 #if defined(PETSC_USE_CTABLE)
2780         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2781         row--;
2782 #else
2783         row = rmap_i[row];
2784 #endif
2785         ilen  = imat_ilen[row];
2786         mat_i = imat_i[row];
2787         mat_a = imat_a + mat_i;
2788         mat_j = imat_j + mat_i;
2789         max2  = rbuf2_i[ct1];
2790         if (!allcolumns[is_no]) {
2791           for (l=0; l<max2; l++,ct2++) {
2792 #if defined(PETSC_USE_CTABLE)
2793             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2794 #else
2795             tcol = cmap_i[rbuf3_i[ct2]];
2796 #endif
2797             if (tcol) {
2798               *mat_j++ = tcol - 1;
2799               *mat_a++ = rbuf4_i[ct2];
2800               ilen++;
2801             }
2802           }
2803         } else { /* allcolumns */
2804           for (l=0; l<max2; l++,ct2++) {
2805             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2806             *mat_a++ = rbuf4_i[ct2];
2807             ilen++;
2808           }
2809         }
2810         imat_ilen[row] = ilen;
2811       }
2812     }
2813   }
2814 
2815   if (!iscsorted) { /* sort column indices of the rows */
2816     for (i=0; i<ismax; i++) {
2817       subc      = (Mat_SeqAIJ*)submats[i]->data;
2818       imat_j    = subc->j;
2819       imat_i    = subc->i;
2820       imat_a    = subc->a;
2821       imat_ilen = subc->ilen;
2822 
2823       if (allcolumns[i]) continue;
2824       jmax = nrow[i];
2825       for (j=0; j<jmax; j++) {
2826         mat_i = imat_i[j];
2827         mat_a = imat_a + mat_i;
2828         mat_j = imat_j + mat_i;
2829         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2830       }
2831     }
2832   }
2833 
2834   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2835   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2836   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2837   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2838   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2839 
2840   /* Restore the indices */
2841   for (i=0; i<ismax; i++) {
2842     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2843     if (!allcolumns[i]) {
2844       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2845     }
2846   }
2847 
2848   for (i=0; i<ismax; i++) {
2849     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2850     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2851   }
2852 
2853   /* Destroy allocated memory */
2854   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2855   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2856   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2857 
2858   for (i=0; i<nrqs; ++i) {
2859     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2860   }
2861   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2862 
2863   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2864   PetscFunctionReturn(0);
2865 }
2866 
/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.

 This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
 Following this call, C->A & C->B have been created, even if empty.
 */
2878 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2879 {
2880   /* If making this function public, change the error returned in this function away from _PLIB. */
2881   PetscErrorCode ierr;
2882   Mat_MPIAIJ     *aij;
2883   Mat_SeqAIJ     *Baij;
2884   PetscBool      seqaij,Bdisassembled;
2885   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2886   PetscScalar    v;
2887   const PetscInt *rowindices,*colindices;
2888 
2889   PetscFunctionBegin;
2890   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2891   if (A) {
2892     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2893     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2894     if (rowemb) {
2895       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2896       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2897     } else {
2898       if (C->rmap->n != A->rmap->n) {
2899 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2900       }
2901     }
2902     if (dcolemb) {
2903       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2904       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2905     } else {
2906       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2907     }
2908   }
2909   if (B) {
2910     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2911     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2912     if (rowemb) {
2913       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2914       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2915     } else {
2916       if (C->rmap->n != B->rmap->n) {
2917 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2918       }
2919     }
2920     if (ocolemb) {
2921       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2922       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2923     } else {
2924       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2925     }
2926   }
2927 
2928   aij = (Mat_MPIAIJ*)C->data;
2929   if (!aij->A) {
2930     /* Mimic parts of MatMPIAIJSetPreallocation() */
2931     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2932     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2933     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2934     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2935     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2936   }
2937   if (A) {
2938     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2939   } else {
2940     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2941   }
2942   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2943     /*
2944       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2945       need to "disassemble" B -- convert it to using C's global indices.
2946       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2947 
2948       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2949       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2950 
2951       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2952       At least avoid calling MatSetValues() and the implied searches?
2953     */
2954 
2955     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2956 #if defined(PETSC_USE_CTABLE)
2957       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2958 #else
2959       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2960       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2961       if (aij->B) {
2962         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2963       }
2964 #endif
2965       ngcol = 0;
2966       if (aij->lvec) {
2967 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2968       }
2969       if (aij->garray) {
2970 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2971 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2972       }
2973       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2974       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2975     }
2976     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2977       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2978     }
2979     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2980       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2981     }
2982   }
2983   Bdisassembled = PETSC_FALSE;
2984   if (!aij->B) {
2985     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2986     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2987     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2988     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2989     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2990     Bdisassembled = PETSC_TRUE;
2991   }
2992   if (B) {
2993     Baij = (Mat_SeqAIJ*)B->data;
2994     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2995       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2996       for (i=0; i<B->rmap->n; i++) {
2997 	nz[i] = Baij->i[i+1] - Baij->i[i];
2998       }
2999       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3000       ierr = PetscFree(nz);CHKERRQ(ierr);
3001     }
3002 
3003     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3004     shift = rend-rstart;
3005     count = 0;
3006     rowindices = NULL;
3007     colindices = NULL;
3008     if (rowemb) {
3009       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3010     }
3011     if (ocolemb) {
3012       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3013     }
3014     for (i=0; i<B->rmap->n; i++) {
3015       PetscInt row;
3016       row = i;
3017       if (rowindices) row = rowindices[i];
3018       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3019 	col  = Baij->j[count];
3020 	if (colindices) col = colindices[col];
3021 	if (Bdisassembled && col>=rstart) col += shift;
3022 	v    = Baij->a[count];
3023 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3024 	++count;
3025       }
3026     }
3027     /* No assembly for aij->B is necessary. */
3028     /* FIXME: set aij->B's nonzerostate correctly. */
3029   } else {
3030     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3031   }
3032   C->preallocated  = PETSC_TRUE;
3033   C->was_assembled = PETSC_FALSE;
3034   C->assembled     = PETSC_FALSE;
3035    /*
3036       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3037       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3038    */
3039   PetscFunctionReturn(0);
3040 }
3041 
3042 /*
3043   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3044  */
3045 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3046 {
3047   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3048 
3049   PetscFunctionBegin;
3050   PetscValidPointer(A,2);
3051   PetscValidPointer(B,3);
3052   /* FIXME: make sure C is assembled */
3053   *A = aij->A;
3054   *B = aij->B;
3055   /* Note that we don't incref *A and *B, so be careful! */
3056   PetscFunctionReturn(0);
3057 }
3058 
3059 /*
3060   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3061   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3062 */
3063 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3064                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3065 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3066 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3067 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3068 {
3069   PetscErrorCode ierr;
3070   PetscMPIInt    isize,flag;
3071   PetscInt       i,ii,cismax,ispar;
3072   Mat            *A,*B;
3073   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3074 
3075   PetscFunctionBegin;
3076   if (!ismax) PetscFunctionReturn(0);
3077 
3078   for (i = 0, cismax = 0; i < ismax; ++i) {
3079     PetscMPIInt isize;
3080     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3081     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3082     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3083     if (isize > 1) ++cismax;
3084   }
3085 
3086   /*
3087      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3088      ispar counts the number of parallel ISs across C's comm.
3089   */
3090   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3091   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3092     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3093     PetscFunctionReturn(0);
3094   }
3095 
3096   /* if (ispar) */
3097   /*
3098     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3099     These are used to extract the off-diag portion of the resulting parallel matrix.
3100     The row IS for the off-diag portion is the same as for the diag portion,
3101     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3102   */
3103   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3104   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3105   for (i = 0, ii = 0; i < ismax; ++i) {
3106     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3107     if (isize > 1) {
3108       /*
3109 	 TODO: This is the part that's ***NOT SCALABLE***.
3110 	 To fix this we need to extract just the indices of C's nonzero columns
3111 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3112 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3113 	 to be done without serializing on the IS list, so, most likely, it is best
3114 	 done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3115       */
3116       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3117       /* Now we have to
3118 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3119 	     were sorted on each rank, concatenated they might no longer be sorted;
3120 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3121 	     indices in the nondecreasing order to the original index positions.
3122 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3123       */
3124       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3125       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3126       ++ii;
3127     }
3128   }
3129   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3130   for (i = 0, ii = 0; i < ismax; ++i) {
3131     PetscInt       j,issize;
3132     const PetscInt *indices;
3133 
3134     /*
3135        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3136      */
3137     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3138     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3139     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3140     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3141     for (j = 1; j < issize; ++j) {
3142       if (indices[j] == indices[j-1]) {
3143 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3144       }
3145     }
3146     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3147 
3148 
3149     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3150     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3151     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3152     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3153     for (j = 1; j < issize; ++j) {
3154       if (indices[j-1] == indices[j]) {
3155 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3156       }
3157     }
3158     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3159     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3160     if (isize > 1) {
3161       cisrow[ii] = isrow[i];
3162       ++ii;
3163     }
3164   }
3165   /*
3166     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3167     array of sequential matrices underlying the resulting parallel matrices.
3168     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3169     contain duplicates.
3170 
3171     There are as many diag matrices as there are original index sets. There are only as many parallel
3172     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3173 
3174     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3175     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3176       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3177       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3178       with A[i] and B[ii] extracted from the corresponding MPI submat.
3179     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3180       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3181       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3182       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3183       values into A[i] and B[ii] sitting inside the corresponding submat.
3184     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3185       A[i], B[ii], AA[i] or BB[ii] matrices.
3186   */
3187   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3188   if (scall == MAT_INITIAL_MATRIX) {
3189     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3190   }
3191 
3192   /* Now obtain the sequential A and B submatrices separately. */
3193   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3194   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3195   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3196 
3197   /*
3198     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3199     matrices A & B have been extracted directly into the parallel matrices containing them, or
3200     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3201     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3202     to have the same sparsity pattern.
3203     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3204     must be constructed for C. This is done by setseqmat(s).
3205   */
3206   for (i = 0, ii = 0; i < ismax; ++i) {
3207     /*
3208        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3209        That way we can avoid sorting and computing permutations when reusing.
3210        To this end:
3211         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3212 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3213 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3214     */
3215     MatStructure pattern;
3216     pattern = DIFFERENT_NONZERO_PATTERN;
3217 
3218     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3219     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3220     if (isize > 1) {
3221       if (scall == MAT_INITIAL_MATRIX) {
3222 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3223 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3224 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3225 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3226 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3227       }
3228       /*
3229 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3230       */
3231       {
3232 	Mat AA,BB;
3233         AA = A[i];
3234         BB = B[ii];
3235 	if (AA || BB) {
3236 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3237 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3238 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3239 	}
3240 
3241         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3242       }
3243       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3244       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3245       ++ii;
3246     } else { /* if (isize == 1) */
3247       if (scall == MAT_REUSE_MATRIX) {
3248         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3249       }
3250       if (isrow_p[i] || iscol_p[i]) {
3251         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3252         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3253 	/* Otherwise A is extracted straight into (*submats)[i]. */
3254 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3255 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3256       } else (*submat)[i] = A[i];
3257     }
3258     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3259     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3260   }
3261   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3262   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3263   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3264   ierr = PetscFree(A);CHKERRQ(ierr);
3265   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3266   PetscFunctionReturn(0);
3267 }
3268 
3269 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3270 {
3271   PetscErrorCode ierr;
3272 
3273   PetscFunctionBegin;
3274   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3275   PetscFunctionReturn(0);
3276 }
3277