xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 40ad565917cf9ec35102befb4fa1aa0d4e872a47)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Expands the nidx index sets is[]
   by one level of overlap without building any global-length arrays.

   Outline:
     1. Gather all IS entries and classify each row by owner (via the row layout).
     2. Ship remotely owned rows to their owners with PetscCommBuildTwoSided +
        a PetscSF broadcast; messages are pairs <is id, row>.
     3. Owners compute the overlap of those rows (_Send_Scalable) and send it
        back; meanwhile this rank computes the overlap of its own rows
        (_Local_Scalable), overlapping communication with computation.
     4. Merge the returned data into the index sets (_Receive_Scalable).

   If no rank has any remote rows (typical for the first overlap level),
   everything reduces to the purely local step.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: remember the row, who owns it, and which IS it came from */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size (two entries per row: <is id, row>) */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* build the SF graph: each leaf maps to a slot in the sender's packed buffer,
   * fromsizes[2*i+1] being that sender's starting offset */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes, then reply data, flowing in the reverse direction */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges overlap data received
   from remote owners into the nidx index sets is[], then rebuilds each IS
   with duplicates removed.

   recvdata is a packed sequence of records <is id, data size, rows...>,
   nrecvs entries in total.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slot of (max_lsize+nrecvs) entries per IS: enough for its current
   * indices plus, in the worst case, every received entry */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Given the rows that other ranks
   asked this rank to expand (fromrows, grouped by sending rank in fromsizes),
   computes for each <sender, is id> the set of columns adjacent to those rows
   (from both the diagonal and off-diagonal blocks) and packs them into reply
   buffers.

   Outputs (allocated here; ownership passes to the caller):
     *sbrowsizes - for each sender i: [2*i] reply length, [2*i+1] reply offset
     *sbrows     - packed reply, triples <is id, number of rows, rows...>

   Two passes over the same request data: the first only counts (indv_counts)
   so that the reply buffer can be allocated in one shot, the second fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count how many distinct columns each <sender, is> pair produces */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* convert to a local row of the diagonal block */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart; /* shift diagonal-block column to global numbering */
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]]; /* off-diagonal columns are already mapped to global via gcols */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, this time packing the reply buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* per-sender offsets into the packed reply, stored alongside the sizes */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expands each index set in is[]
   by the columns adjacent to its locally owned rows, using only the local
   CSR data of the diagonal (amat) and off-diagonal (bmat) blocks.

   Rows owned by other ranks are skipped here; they are handled by the
   send/receive pair in the scalable path. Each IS is destroyed and rebuilt
   with sorted, duplicate-free indices.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* remote rows are expanded by their owners */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart; /* shift diagonal-block column to global numbering */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]]; /* off-diagonal columns already global via gcols */
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - Expands the imax index sets is[] by one
   level of overlap using the original (non-scalable) algorithm: hand-rolled
   MPI_Isend/Irecv exchanges in the message format documented above.

   Phase 1: each rank sends the rows it does not own to their owners, while
   marking its own rows in per-IS bitmaps (table) and local tables.
   Phase 2: owners compute the overlap of the requested rows
   (MatIncreaseOverlap_MPIAIJ_Receive) and send it back; the replies are merged
   and each IS is rebuilt.

   With PETSC_USE_CTABLE the per-IS row lists are kept in hash tables
   (memory proportional to the entries); otherwise dense arrays of length M
   (global row count) are used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = total rows destined for rank p, w3[p] = number of ISs contributing */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single buffer tmp into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) { /* first time this row is seen locally */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message (same header layout as the outgoing messages) */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* cursor into the data region, past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* only add rows not seen before */
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* one scratch array sized for the largest table, reused for every IS */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated local data */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores row+1 at position isz+1; shift both back */
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once,
       to do the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total number of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
900 /*
901       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
902          and return the output
903 
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
907            rbuf - an array of pointers to the recieved requests
908 
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912 
913   For better efficiency perhaps we should malloc separately each xdata[i],
914 then if a remalloc is required we need only copy the data for that one row
915 rather then all previous rows as it is now where a single large chunck of
916 memory is used.
917 
918 */
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (!(ct3 < mem_estimate)) {
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (!(ct3 < mem_estimate)) {
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* Update the header*/
1024       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1042 {
1043   Mat            B;
1044   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1045   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046   PetscErrorCode ierr;
1047   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1048   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1049   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1050   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1051 
1052   PetscFunctionBegin;
1053   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1054   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1055 
1056   if (scall == MAT_INITIAL_MATRIX) {
1057     /* ----------------------------------------------------------------
1058          Tell every processor the number of nonzeros per row
1059     */
1060     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1061     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1062       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1063     }
1064     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1065     for (i=0; i<size; i++) {
1066       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1067       displs[i]     = A->rmap->range[i];
1068     }
1069 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1070     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1071 #else
1072     sendcount = A->rmap->rend - A->rmap->rstart;
1073     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1074 #endif
1075     /* ---------------------------------------------------------------
1076          Create the sequential matrix of the same type as the local block diagonal
1077     */
1078     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1079     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1080     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1081     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1082     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1083     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1084     **Bin = B;
1085     b     = (Mat_SeqAIJ*)B->data;
1086 
1087     /*--------------------------------------------------------------------
1088        Copy my part of matrix column indices over
1089     */
1090     sendcount  = ad->nz + bd->nz;
1091     jsendbuf   = b->j + b->i[rstarts[rank]];
1092     a_jsendbuf = ad->j;
1093     b_jsendbuf = bd->j;
1094     n          = A->rmap->rend - A->rmap->rstart;
1095     cnt        = 0;
1096     for (i=0; i<n; i++) {
1097 
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into useable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     sendcount = ad->nz + bd->nz;
1161     sendbuf   = b->a + b->i[rstarts[rank]];
1162     a_sendbuf = ad->a;
1163     b_sendbuf = bd->a;
1164     b_sendj   = bd->j;
1165     n         = A->rmap->rend - A->rmap->rstart;
1166     cnt       = 0;
1167     for (i=0; i<n; i++) {
1168 
1169       /* put in lower diagonal portion */
1170       m = bd->i[i+1] - bd->i[i];
1171       while (m > 0) {
1172         /* is it above diagonal (in bd (compressed) numbering) */
1173         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1174         sendbuf[cnt++] = *b_sendbuf++;
1175         m--;
1176         b_sendj++;
1177       }
1178 
1179       /* put in diagonal portion */
1180       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1181         sendbuf[cnt++] = *a_sendbuf++;
1182       }
1183 
1184       /* put in upper diagonal portion */
1185       while (m-- > 0) {
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         b_sendj++;
1188       }
1189     }
1190     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191 
1192     /* -----------------------------------------------------------------
1193        Gather all numerical values to all processors
1194     */
1195     if (!recvcounts) {
1196       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1197     }
1198     for (i=0; i<size; i++) {
1199       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1200     }
1201     displs[0] = 0;
1202     for (i=1; i<size; i++) {
1203       displs[i] = displs[i-1] + recvcounts[i-1];
1204     }
1205     recvbuf = b->a;
1206 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1207     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1208 #else
1209     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1210 #endif
1211   }  /* endof (flag == MAT_GET_VALUES) */
1212   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1213 
1214   if (A->symmetric) {
1215     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1216   } else if (A->hermitian) {
1217     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1218   } else if (A->structurally_symmetric) {
1219     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1220   }
1221   PetscFunctionReturn(0);
1222 }
1223 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1224 {
1225   PetscErrorCode ierr;
1226   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1227   Mat_SubMat     *submatj = c->submatis1;
1228   PetscInt       i;
1229 
1230   PetscFunctionBegin;
1231   if (submatj->id) PetscFunctionReturn(0);
1232 
1233   ierr = submatj->destroy(C);CHKERRQ(ierr);
1234 
1235   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1236 
1237   for (i=0; i<submatj->nrqr; ++i) {
1238     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1239   }
1240   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1241 
1242   if (submatj->rbuf1) {
1243     ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1244     ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1245   }
1246 
1247   for (i=0; i<submatj->nrqs; ++i) {
1248     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1249   }
1250   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1251   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1252 
1253   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1254 
1255 #if defined(PETSC_USE_CTABLE)
1256   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1257   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1258   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1259 #else
1260   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1261 #endif
1262 
1263   if (!submatj->allcolumns) {
1264 #if defined(PETSC_USE_CTABLE)
1265     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1266 #else
1267     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1268 #endif
1269   }
1270 
1271   ierr = PetscFree(submatj);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1276 {
1277   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1278   Mat            submat,A = c->A,B = c->B;
1279   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1280   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1281   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1282   const PetscInt *icol,*irow;
1283   PetscInt       nrow,ncol,start;
1284   PetscErrorCode ierr;
1285   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1286   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1287   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1288   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1289   PetscInt       *lens,rmax,ncols,*cols,Crow;
1290 #if defined(PETSC_USE_CTABLE)
1291   PetscTable     cmap,rmap;
1292   PetscInt       *cmap_loc,*rmap_loc;
1293 #else
1294   PetscInt       *cmap,*rmap;
1295 #endif
1296   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1297   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1298   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1299   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1300   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1301   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1302   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1303   MPI_Comm       comm;
1304   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1305   PetscMPIInt    *onodes1,*olengths1,idex,end;
1306   Mat_SubMat     *smatis1;
1307   PetscBool      isrowsorted;
1308 
1309   PetscFunctionBegin;
1310   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1311 
1312   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1313   size = c->size;
1314   rank = c->rank;
1315 
1316   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1317   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1318 
1319   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1320   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1321   if (allcolumns) {
1322     icol = NULL;
1323     ncol = C->cmap->N;
1324   } else {
1325     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1326     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1327   }
1328 
1329   if (scall == MAT_INITIAL_MATRIX) {
1330     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1331 
1332     /* Get some new tags to keep the communication clean */
1333     tag1 = ((PetscObject)C)->tag;
1334     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1335     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1336 
1337     /* evaluate communication - mesg to who, length of mesg, and buffer space
1338      required. Based on this, buffers are allocated, and data copied into them */
1339     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1340     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1341 
1342     /* w1[proc] = num of rows owned by proc -- to be requested */
1343     proc = 0;
1344     nrqs = 0; /* num of outgoing messages */
1345     for (j=0; j<nrow; j++) {
1346       row  = irow[j]; /* sorted! */
1347       while (row >= C->rmap->range[proc+1]) proc++;
1348       w1[proc]++;
1349       row2proc[j] = proc; /* map row index to proc */
1350 
1351       if (proc != rank && !w2[proc]) {
1352         w2[proc] = 1; nrqs++;
1353       }
1354     }
1355     w1[rank] = 0;  /* rows owned by self will not be requested */
1356 
1357     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1358     for (proc=0,j=0; proc<size; proc++) {
1359       if (w1[proc]) { pa[j++] = proc;}
1360     }
1361 
1362     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1363     msz = 0;              /* total mesg length (for all procs) */
1364     for (i=0; i<nrqs; i++) {
1365       proc      = pa[i];
1366       w1[proc] += 3;
1367       msz      += w1[proc];
1368     }
1369     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1370 
1371     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1372     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1373     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1374 
1375     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1376        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1377     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1378 
1379     /* Now post the Irecvs corresponding to these messages */
1380     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1381 
1382     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1383     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1384 
1385     /* Allocate Memory for outgoing messages */
1386     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1387     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1388     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1389 
1390     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1391     iptr = tmp;
1392     for (i=0; i<nrqs; i++) {
1393       proc        = pa[i];
1394       sbuf1[proc] = iptr;
1395       iptr       += w1[proc];
1396     }
1397 
1398     /* Form the outgoing messages */
1399     /* Initialize the header space */
1400     for (i=0; i<nrqs; i++) {
1401       proc      = pa[i];
1402       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1403       ptr[proc] = sbuf1[proc] + 3;
1404     }
1405 
1406     /* Parse the isrow and copy data into outbuf */
1407     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1408     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1409       proc = row2proc[j];
1410       if (proc != rank) { /* copy to the outgoing buf*/
1411         *ptr[proc] = irow[j];
1412         ctr[proc]++; ptr[proc]++;
1413       }
1414     }
1415 
1416     /* Update the headers for the current IS */
1417     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1418       if ((ctr_j = ctr[j])) {
1419         sbuf1_j        = sbuf1[j];
1420         k              = ++sbuf1_j[0];
1421         sbuf1_j[2*k]   = ctr_j;
1422         sbuf1_j[2*k-1] = 0;
1423       }
1424     }
1425 
1426     /* Now post the sends */
1427     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1428     for (i=0; i<nrqs; ++i) {
1429       proc = pa[i];
1430       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1431     }
1432 
1433     /* Post Receives to capture the buffer size */
1434     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1435     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1436 
1437     rbuf2[0] = tmp + msz;
1438     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1439 
1440     for (i=0; i<nrqs; ++i) {
1441       proc = pa[i];
1442       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1443     }
1444 
1445     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1446 
1447     /* Send to other procs the buf size they should allocate */
1448     /* Receive messages*/
1449     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1450     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1451 
1452     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1453     for (i=0; i<nrqr; ++i) {
1454       req_size[i] = 0;
1455       rbuf1_i        = rbuf1[i];
1456       start          = 2*rbuf1_i[0] + 1;
1457       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1458       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1459       sbuf2_i        = sbuf2[i];
1460       for (j=start; j<end; j++) {
1461         k            = rbuf1_i[j] - rstart;
1462         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1463         sbuf2_i[j]   = ncols;
1464         req_size[i] += ncols;
1465       }
1466       req_source1[i] = r_status1[i].MPI_SOURCE;
1467 
1468       /* form the header */
1469       sbuf2_i[0] = req_size[i];
1470       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1471 
1472       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1473     }
1474 
1475     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1476     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1477 
1478     /* rbuf2 is received, Post recv column indices a->j */
1479     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1480 
1481     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1482     for (i=0; i<nrqs; ++i) {
1483       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1484       req_source2[i] = r_status2[i].MPI_SOURCE;
1485       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1486     }
1487 
1488     /* Wait on sends1 and sends2 */
1489     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1490     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1491     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1492     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1493 
1494     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1495     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1496 
1497     /* Now allocate sending buffers for a->j, and send them off */
1498     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1499     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1500     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1501     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1502 
1503     for (i=0; i<nrqr; i++) { /* for each requested message */
1504       rbuf1_i   = rbuf1[i];
1505       sbuf_aj_i = sbuf_aj[i];
1506       ct1       = 2*rbuf1_i[0] + 1;
1507       ct2       = 0;
1508       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1509 
1510       kmax = rbuf1[i][2];
1511       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1512         row    = rbuf1_i[ct1] - rstart;
1513         nzA    = ai[row+1] - ai[row];
1514         nzB    = bi[row+1] - bi[row];
1515         ncols  = nzA + nzB;
1516         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1517 
1518         /* load the column indices for this row into cols*/
1519         cols = sbuf_aj_i + ct2;
1520 
1521         lwrite = 0;
1522         for (l=0; l<nzB; l++) {
1523           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1524         }
1525         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1526         for (l=0; l<nzB; l++) {
1527           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1528         }
1529 
1530         ct2 += ncols;
1531       }
1532       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1533     }
1534 
1535     /* create column map (cmap): global col of C -> local col of submat */
1536 #if defined(PETSC_USE_CTABLE)
1537     if (!allcolumns) {
1538       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1539       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1540       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1541         if (icol[j] >= cstart && icol[j] <cend) {
1542           cmap_loc[icol[j] - cstart] = j+1;
1543         } else { /* use PetscTable for non-local col indices */
1544           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1545         }
1546       }
1547     } else {
1548       cmap     = NULL;
1549       cmap_loc = NULL;
1550     }
1551     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1552 #else
1553     if (!allcolumns) {
1554       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1555       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1556     } else {
1557       cmap = NULL;
1558     }
1559 #endif
1560 
1561     /* Create lens for MatSeqAIJSetPreallocation() */
1562     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1563 
1564     /* Compute lens from local part of C */
1565     for (j=0; j<nrow; j++) {
1566       row  = irow[j];
1567       proc = row2proc[j];
1568       if (proc == rank) {
1569         /* diagonal part A = c->A */
1570         ncols = ai[row-rstart+1] - ai[row-rstart];
1571         cols  = aj + ai[row-rstart];
1572         if (!allcolumns) {
1573           for (k=0; k<ncols; k++) {
1574 #if defined(PETSC_USE_CTABLE)
1575             tcol = cmap_loc[cols[k]];
1576 #else
1577             tcol = cmap[cols[k]+cstart];
1578 #endif
1579             if (tcol) lens[j]++;
1580           }
1581         } else { /* allcolumns */
1582           lens[j] = ncols;
1583         }
1584 
1585         /* off-diagonal part B = c->B */
1586         ncols = bi[row-rstart+1] - bi[row-rstart];
1587         cols  = bj + bi[row-rstart];
1588         if (!allcolumns) {
1589           for (k=0; k<ncols; k++) {
1590 #if defined(PETSC_USE_CTABLE)
1591             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1592 #else
1593             tcol = cmap[bmap[cols[k]]];
1594 #endif
1595             if (tcol) lens[j]++;
1596           }
1597         } else { /* allcolumns */
1598           lens[j] += ncols;
1599         }
1600       }
1601     }
1602 
1603     /* Create row map (rmap): global row of C -> local row of submat */
1604 #if defined(PETSC_USE_CTABLE)
1605     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1606     for (j=0; j<nrow; j++) {
1607       row  = irow[j];
1608       proc = row2proc[j];
1609       if (proc == rank) { /* a local row */
1610         rmap_loc[row - rstart] = j;
1611       } else {
1612         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1613       }
1614     }
1615 #else
1616     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1617     for (j=0; j<nrow; j++) {
1618       rmap[irow[j]] = j;
1619     }
1620 #endif
1621 
1622     /* Update lens from offproc data */
1623     /* recv a->j is done */
1624     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1625     for (i=0; i<nrqs; i++) {
1626       proc    = pa[i];
1627       sbuf1_i = sbuf1[proc];
1628       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1629       ct1     = 2 + 1;
1630       ct2     = 0;
1631       rbuf2_i = rbuf2[i]; /* received length of C->j */
1632       rbuf3_i = rbuf3[i]; /* received C->j */
1633 
1634       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1635       max1   = sbuf1_i[2];
1636       for (k=0; k<max1; k++,ct1++) {
1637 #if defined(PETSC_USE_CTABLE)
1638         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1639         row--;
1640         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1641 #else
1642         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1643 #endif
1644         /* Now, store row index of submat in sbuf1_i[ct1] */
1645         sbuf1_i[ct1] = row;
1646 
1647         nnz = rbuf2_i[ct1];
1648         if (!allcolumns) {
1649           for (l=0; l<nnz; l++,ct2++) {
1650 #if defined(PETSC_USE_CTABLE)
1651             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1652               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1653             } else {
1654               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1655             }
1656 #else
1657             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1658 #endif
1659             if (tcol) lens[row]++;
1660           }
1661         } else { /* allcolumns */
1662           lens[row] += nnz;
1663         }
1664       }
1665     }
1666     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1667     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1668 
1669     /* Create the submatrices */
1670     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1671     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1672 
1673     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1674     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1675     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1676     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1677     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1678 
1679     /* create struct Mat_SubMat and attached it to submat */
1680     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1681     subc = (Mat_SeqAIJ*)submat->data;
1682     subc->submatis1 = smatis1;
1683 
1684     smatis1->id          = 0;
1685     smatis1->nrqs        = nrqs;
1686     smatis1->nrqr        = nrqr;
1687     smatis1->rbuf1       = rbuf1;
1688     smatis1->rbuf2       = rbuf2;
1689     smatis1->rbuf3       = rbuf3;
1690     smatis1->sbuf2       = sbuf2;
1691     smatis1->req_source2 = req_source2;
1692 
1693     smatis1->sbuf1       = sbuf1;
1694     smatis1->ptr         = ptr;
1695     smatis1->tmp         = tmp;
1696     smatis1->ctr         = ctr;
1697 
1698     smatis1->pa           = pa;
1699     smatis1->req_size     = req_size;
1700     smatis1->req_source1  = req_source1;
1701 
1702     smatis1->allcolumns  = allcolumns;
1703     smatis1->row2proc    = row2proc;
1704     smatis1->rmap        = rmap;
1705     smatis1->cmap        = cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     smatis1->rmap_loc    = rmap_loc;
1708     smatis1->cmap_loc    = cmap_loc;
1709 #endif
1710 
1711     smatis1->destroy     = submat->ops->destroy;
1712     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1713     submat->factortype   = C->factortype;
1714 
1715     /* compute rmax */
1716     rmax = 0;
1717     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1718 
1719   } else { /* scall == MAT_REUSE_MATRIX */
1720     submat = submats[0];
1721     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1722 
1723     subc    = (Mat_SeqAIJ*)submat->data;
1724     rmax    = subc->rmax;
1725     smatis1 = subc->submatis1;
1726     nrqs        = smatis1->nrqs;
1727     nrqr        = smatis1->nrqr;
1728     rbuf1       = smatis1->rbuf1;
1729     rbuf2       = smatis1->rbuf2;
1730     rbuf3       = smatis1->rbuf3;
1731     req_source2 = smatis1->req_source2;
1732 
1733     sbuf1     = smatis1->sbuf1;
1734     sbuf2     = smatis1->sbuf2;
1735     ptr       = smatis1->ptr;
1736     tmp       = smatis1->tmp;
1737     ctr       = smatis1->ctr;
1738 
1739     pa         = smatis1->pa;
1740     req_size   = smatis1->req_size;
1741     req_source1 = smatis1->req_source1;
1742 
1743     allcolumns = smatis1->allcolumns;
1744     row2proc   = smatis1->row2proc;
1745     rmap       = smatis1->rmap;
1746     cmap       = smatis1->cmap;
1747 #if defined(PETSC_USE_CTABLE)
1748     rmap_loc   = smatis1->rmap_loc;
1749     cmap_loc   = smatis1->cmap_loc;
1750 #endif
1751   }
1752 
1753   /* Post recv matrix values */
1754   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1755   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1756   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1757   for (i=0; i<nrqs; ++i) {
1758     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1759     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1760   }
1761 
1762   /* Allocate sending buffers for a->a, and send them off */
1763   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1764   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1765   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1766   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1767 
1768   for (i=0; i<nrqr; i++) {
1769     rbuf1_i   = rbuf1[i];
1770     sbuf_aa_i = sbuf_aa[i];
1771     ct1       = 2*rbuf1_i[0]+1;
1772     ct2       = 0;
1773     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1774 
1775     kmax = rbuf1_i[2];
1776     for (k=0; k<kmax; k++,ct1++) {
1777       row = rbuf1_i[ct1] - rstart;
1778       nzA = ai[row+1] - ai[row];
1779       nzB = bi[row+1] - bi[row];
1780       ncols  = nzA + nzB;
1781       cworkB = bj + bi[row];
1782       vworkA = a_a + ai[row];
1783       vworkB = b_a + bi[row];
1784 
1785       /* load the column values for this row into vals*/
1786       vals = sbuf_aa_i + ct2;
1787 
1788       lwrite = 0;
1789       for (l=0; l<nzB; l++) {
1790         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1791       }
1792       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1793       for (l=0; l<nzB; l++) {
1794         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1795       }
1796 
1797       ct2 += ncols;
1798     }
1799     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1800   }
1801 
1802   /* Assemble submat */
1803   /* First assemble the local rows */
1804   for (j=0; j<nrow; j++) {
1805     row  = irow[j];
1806     proc = row2proc[j];
1807     if (proc == rank) {
1808       Crow = row - rstart;  /* local row index of C */
1809 #if defined(PETSC_USE_CTABLE)
1810       row = rmap_loc[Crow]; /* row index of submat */
1811 #else
1812       row = rmap[row];
1813 #endif
1814 
1815       if (allcolumns) {
1816         /* diagonal part A = c->A */
1817         ncols = ai[Crow+1] - ai[Crow];
1818         cols  = aj + ai[Crow];
1819         vals  = a->a + ai[Crow];
1820         i     = 0;
1821         for (k=0; k<ncols; k++) {
1822           subcols[i]   = cols[k] + cstart;
1823           subvals[i++] = vals[k];
1824         }
1825 
1826         /* off-diagonal part B = c->B */
1827         ncols = bi[Crow+1] - bi[Crow];
1828         cols  = bj + bi[Crow];
1829         vals  = b->a + bi[Crow];
1830         for (k=0; k<ncols; k++) {
1831           subcols[i]   = bmap[cols[k]];
1832           subvals[i++] = vals[k];
1833         }
1834 
1835         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1836 
1837       } else { /* !allcolumns */
1838 #if defined(PETSC_USE_CTABLE)
1839         /* diagonal part A = c->A */
1840         ncols = ai[Crow+1] - ai[Crow];
1841         cols  = aj + ai[Crow];
1842         vals  = a->a + ai[Crow];
1843         i     = 0;
1844         for (k=0; k<ncols; k++) {
1845           tcol = cmap_loc[cols[k]];
1846           if (tcol) {
1847             subcols[i]   = --tcol;
1848             subvals[i++] = vals[k];
1849           }
1850         }
1851 
1852         /* off-diagonal part B = c->B */
1853         ncols = bi[Crow+1] - bi[Crow];
1854         cols  = bj + bi[Crow];
1855         vals  = b->a + bi[Crow];
1856         for (k=0; k<ncols; k++) {
1857           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1858           if (tcol) {
1859             subcols[i]   = --tcol;
1860             subvals[i++] = vals[k];
1861           }
1862         }
1863 #else
1864         /* diagonal part A = c->A */
1865         ncols = ai[Crow+1] - ai[Crow];
1866         cols  = aj + ai[Crow];
1867         vals  = a->a + ai[Crow];
1868         i     = 0;
1869         for (k=0; k<ncols; k++) {
1870           tcol = cmap[cols[k]+cstart];
1871           if (tcol) {
1872             subcols[i]   = --tcol;
1873             subvals[i++] = vals[k];
1874           }
1875         }
1876 
1877         /* off-diagonal part B = c->B */
1878         ncols = bi[Crow+1] - bi[Crow];
1879         cols  = bj + bi[Crow];
1880         vals  = b->a + bi[Crow];
1881         for (k=0; k<ncols; k++) {
1882           tcol = cmap[bmap[cols[k]]];
1883           if (tcol) {
1884             subcols[i]   = --tcol;
1885             subvals[i++] = vals[k];
1886           }
1887         }
1888 #endif
1889         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1890       }
1891     }
1892   }
1893 
1894   /* Now assemble the off-proc rows */
1895   for (i=0; i<nrqs; i++) { /* for each requested message */
1896     /* recv values from other processes */
1897     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1898     proc    = pa[idex];
1899     sbuf1_i = sbuf1[proc];
1900     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1901     ct1     = 2 + 1;
1902     ct2     = 0; /* count of received C->j */
1903     ct3     = 0; /* count of received C->j that will be inserted into submat */
1904     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1905     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1906     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1907 
1908     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1909     max1 = sbuf1_i[2];             /* num of rows */
1910     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1911       row = sbuf1_i[ct1]; /* row index of submat */
1912       if (!allcolumns) {
1913         idex = 0;
1914         if (scall == MAT_INITIAL_MATRIX) {
1915           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1916           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1917 #if defined(PETSC_USE_CTABLE)
1918             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1919               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1920             } else {
1921               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1922             }
1923 #else
1924             tcol = cmap[rbuf3_i[ct2]];
1925 #endif
1926             if (tcol) {
1927               subcols[idex]   = --tcol;
1928               subvals[idex++] = rbuf4_i[ct2];
1929 
1930               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1931                For reuse, we replace received C->j with index that should be inserted to submat */
1932               rbuf3_i[ct3++] = ct2;
1933             }
1934           }
1935           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1936 
1937         } else { /* scall == MAT_REUSE_MATRIX */
1938           submat = submats[0];
1939           subc   = (Mat_SeqAIJ*)submat->data;
1940 
1941           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1942           for (l=0; l<nnz; l++) {
1943             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1944             subvals[idex++] = rbuf4_i[ct2];
1945           }
1946 
1947           bj = subc->j + subc->i[row];
1948           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1949         }
1950       } else { /* allcolumns */
1951         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1952         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1953         ct2 += nnz;
1954       }
1955     }
1956   }
1957 
1958   /* sending a->a are done */
1959   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1960   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1961 
1962   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1963   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1964   submats[0] = submat;
1965 
1966   /* Restore the indices */
1967   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1968   if (!allcolumns) {
1969     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1970   }
1971 
1972   /* Destroy allocated memory */
1973   for (i=0; i<nrqs; ++i) {
1974     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1975   }
1976   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1977   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1978   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1979 
1980   if (scall == MAT_INITIAL_MATRIX) {
1981     ierr = PetscFree(lens);CHKERRQ(ierr);
1982     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1983     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1984   }
1985   PetscFunctionReturn(0);
1986 }
1987 
1988 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1989 {
1990   PetscErrorCode ierr;
1991   PetscInt       ncol;
1992   PetscBool      colflag,allcolumns=PETSC_FALSE;
1993 
1994   PetscFunctionBegin;
1995   /* Allocate memory to hold all the submatrices */
1996   if (scall == MAT_INITIAL_MATRIX) {
1997     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
1998   }
1999 
2000   /* Check for special case: each processor gets entire matrix columns */
2001   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2002   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2003   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2004 
2005   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2006   PetscFunctionReturn(0);
2007 }
2008 
2009 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2010 {
2011   PetscErrorCode ierr;
2012   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2013   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2014 
2015   PetscFunctionBegin;
2016 #if 0
2017   /* Check for special case: each processor gets entire matrix */
2018   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2019     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2020     PetscFunctionReturn(0);
2021   }
2022 #endif
2023 
2024   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2025     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2026     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2027     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2028     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2029     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2030       wantallmatrix = PETSC_TRUE;
2031 
2032       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2033     }
2034   }
2035   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2036   if (twantallmatrix) {
2037     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2038     PetscFunctionReturn(0);
2039   }
2040 
2041   /* Allocate memory to hold all the submatrices */
2042   if (scall == MAT_INITIAL_MATRIX) {
2043     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2044   }
2045 
2046   /* Check for special case: each processor gets entire matrix columns */
2047   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2048   for (i=0; i<ismax; i++) {
2049     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2050     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2051     if (colflag && ncol == C->cmap->N) {
2052       allcolumns[i] = PETSC_TRUE;
2053     } else {
2054       allcolumns[i] = PETSC_FALSE;
2055     }
2056   }
2057 
2058   /* Determine the number of stages through which submatrices are done */
2059   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2060 
2061   /*
2062      Each stage will extract nmax submatrices.
2063      nmax is determined by the matrix column dimension.
2064      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2065   */
2066   if (!nmax) nmax = 1;
2067   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2068 
2069   /* Make sure every processor loops through the nstages */
2070   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2071 
2072   for (i=0,pos=0; i<nstages; i++) {
2073     if (pos+nmax <= ismax) max_no = nmax;
2074     else if (pos == ismax) max_no = 0;
2075     else                   max_no = ismax-pos;
2076     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2077     pos += max_no;
2078   }
2079 
2080   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2081   PetscFunctionReturn(0);
2082 }
2083 
2084 /* -------------------------------------------------------------------------*/
2085 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2086 {
2087   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2088   Mat            A  = c->A;
2089   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2090   const PetscInt **icol,**irow;
2091   PetscInt       *nrow,*ncol,start;
2092   PetscErrorCode ierr;
2093   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2094   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2095   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2096   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,max2;
2097   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2098 #if defined(PETSC_USE_CTABLE)
2099   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2100 #else
2101   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2102 #endif
2103   const PetscInt *irow_i;
2104   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2105   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2106   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2107   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2108   MPI_Status     *r_status3,*r_status4,*s_status4;
2109   MPI_Comm       comm;
2110   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2111   PetscMPIInt    *onodes1,*olengths1;
2112   PetscMPIInt    idex,idex2,end;
2113   PetscInt       **row2proc,*row2proc_i;
2114   Mat_SubMat     **smats,*smat_i;
2115   PetscBool      *issorted;
2116 
2117   PetscFunctionBegin;
2118   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2119   tag0 = ((PetscObject)C)->tag;
2120   size = c->size;
2121   rank = c->rank;
2122   if (ismax == 0) printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d, ismax = 0!!! \n",rank,scall);
2123   //printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d\n",rank,scall);
2124 
2125   /* Get some new tags to keep the communication clean */
2126   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2127   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2128   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2129 
2130   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2131   ierr = PetscMalloc1(ismax,&issorted);CHKERRQ(ierr);
2132 
2133   for (i=0; i<ismax; i++) {
2134     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2135 
2136     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2137     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2138     if (allcolumns[i]) {
2139       icol[i] = NULL;
2140       ncol[i] = C->cmap->N;
2141     } else {
2142       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2143       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2144     }
2145   }
2146 
2147 
2148   ierr = PetscMalloc1(ismax,&smats);CHKERRQ(ierr);
2149   ierr = PetscMalloc1(ismax,&row2proc);CHKERRQ(ierr);
2150   ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2151   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2152 
2153   if (scall == MAT_REUSE_MATRIX) {
2154     //PetscInt  *req_source1,*req_source2;
2155 
2156     /* Assumes new rows are same length as the old rows,hence bug! */
2157     for (i=0; i<ismax; i++) {
2158       mat = (Mat_SeqAIJ*)(submats[i]->data);
2159       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2160 
2161       /* Initial matrix as if empty -- rm later!!! */
2162       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2163 
2164       /* Initial matrix as if empty */
2165       submats[i]->factortype = C->factortype;
2166 
2167       smat_i   = mat->submatis1;
2168       smats[i] = smat_i;
2169 
2170       nrqs        = smat_i->nrqs;
2171       nrqr        = smat_i->nrqr;
2172       rbuf1       = smat_i->rbuf1;
2173       rbuf2       = smat_i->rbuf2;
2174       rbuf3       = smat_i->rbuf3;
2175       req_source2 = smat_i->req_source2;
2176 
2177       sbuf1     = smat_i->sbuf1;
2178       sbuf2     = smat_i->sbuf2;
2179       ptr       = smat_i->ptr;
2180       tmp       = smat_i->tmp;
2181       ctr       = smat_i->ctr;
2182 
2183       pa          = smat_i->pa;
2184       req_size    = smat_i->req_size;
2185       req_source1 = smat_i->req_source1;
2186 
2187       allcolumns[i] = smat_i->allcolumns;
2188       row2proc[i]   = smat_i->row2proc;
2189       rmap[i]       = smat_i->rmap;
2190       cmap[i]       = smat_i->cmap;
2191 #if defined(PETSC_USE_CTABLE)
2192       //rmap_loc   = smat_i->rmap_loc;
2193       //cmap_loc   = smat_i->cmap_loc;
2194 #endif
2195     }
2196   }
2197 
2198   if (scall == MAT_INITIAL_MATRIX ||!ismax) {
2199 
2200   /* evaluate communication - mesg to who, length of mesg, and buffer space
2201      required. Based on this, buffers are allocated, and data copied into them*/
2202   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2203   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2204   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2205   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2206 
2207   for (i=0; i<ismax; i++) {
2208     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2209     jmax   = nrow[i];
2210     irow_i = irow[i];
2211 
2212     if (scall == MAT_INITIAL_MATRIX) {
2213       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2214       row2proc[i] = row2proc_i;
2215     } else {
2216       row2proc_i = row2proc[i];
2217     }
2218 
2219     if (issorted[i]) proc = 0;
2220     for (j=0; j<jmax; j++) {
2221       if (!issorted[i]) proc = 0;
2222       row = irow_i[j];
2223       while (row >= C->rmap->range[proc+1]) proc++;
2224       w4[proc]++;
2225       row2proc_i[j] = proc; /* map row index to proc */
2226     }
2227     for (j=0; j<size; j++) {
2228       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2229     }
2230   }
2231 
2232   nrqs     = 0;              /* no of outgoing messages */
2233   msz      = 0;              /* total mesg length (for all procs) */
2234   w1[rank] = 0;              /* no mesg sent to self */
2235   w3[rank] = 0;
2236   for (i=0; i<size; i++) {
2237     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2238   }
2239   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2240   for (i=0,j=0; i<size; i++) {
2241     if (w1[i]) { pa[j] = i; j++; }
2242   }
2243 
2244   /* Each message would have a header = 1 + 2*(no of IS) + data */
2245   for (i=0; i<nrqs; i++) {
2246     j      = pa[i];
2247     w1[j] += w2[j] + 2* w3[j];
2248     msz   += w1[j];
2249   }
2250   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2251 
2252   /* Determine the number of messages to expect, their lengths, from from-ids */
2253   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2254   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2255 
2256   /* Now post the Irecvs corresponding to these messages */
2257   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2258 
2259   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2260   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2261 
2262   /* Allocate Memory for outgoing messages */
2263   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2264   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2265   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2266 
2267   {
2268     PetscInt *iptr = tmp;
2269     k    = 0;
2270     for (i=0; i<nrqs; i++) {
2271       j        = pa[i];
2272       iptr    += k;
2273       sbuf1[j] = iptr;
2274       k        = w1[j];
2275     }
2276   }
2277 
2278   /* Form the outgoing messages */
2279   /* Initialize the header space */
2280   for (i=0; i<nrqs; i++) {
2281     j           = pa[i];
2282     sbuf1[j][0] = 0;
2283     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2284     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2285   }
2286 
2287   /* Parse the isrow and copy data into outbuf */
2288   for (i=0; i<ismax; i++) {
2289     row2proc_i = row2proc[i];
2290     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2291     irow_i = irow[i];
2292     jmax   = nrow[i];
2293     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2294       proc = row2proc_i[j];
2295       if (proc != rank) { /* copy to the outgoing buf*/
2296         ctr[proc]++;
2297         *ptr[proc] = irow_i[j];
2298         ptr[proc]++;
2299       }
2300     }
2301     /* Update the headers for the current IS */
2302     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2303       if ((ctr_j = ctr[j])) {
2304         sbuf1_j        = sbuf1[j];
2305         k              = ++sbuf1_j[0];
2306         sbuf1_j[2*k]   = ctr_j;
2307         sbuf1_j[2*k-1] = i;
2308       }
2309     }
2310   }
2311 
2312   /*  Now  post the sends */
2313   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2314   for (i=0; i<nrqs; ++i) {
2315     j    = pa[i];
2316     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2317   }
2318 
2319   /* Post Receives to capture the buffer size */
2320   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2321   //ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2322   ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2323   rbuf2[0] = tmp + msz;
2324   for (i=1; i<nrqs; ++i) {
2325     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2326   }
2327   for (i=0; i<nrqs; ++i) {
2328     j    = pa[i];
2329     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2330   }
2331 
2332   /* Send to other procs the buf size they should allocate */
2333 
2334 
2335   /* Receive messages*/
2336   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2337   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2338   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2339   {
2340     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2341     PetscInt   *sbuf2_i;
2342 
2343     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2344     for (i=0; i<nrqr; ++i) {
2345       //ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2346       idex           = i;
2347       req_size[idex] = 0;
2348       rbuf1_i        = rbuf1[idex];
2349       start          = 2*rbuf1_i[0] + 1;
2350       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2351       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2352       sbuf2_i        = sbuf2[idex];
2353       for (j=start; j<end; j++) {
2354         id              = rbuf1_i[j] - rstart;
2355         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2356         sbuf2_i[j]      = ncols;
2357         req_size[idex] += ncols;
2358       }
2359       req_source1[idex] = r_status1[i].MPI_SOURCE;
2360       /* form the header */
2361       sbuf2_i[0] = req_size[idex];
2362       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2363 
2364       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2365     }
2366   }
2367   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2368   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2369 
2370   /*  recv buffer sizes */
2371   /* Receive messages*/
2372 
2373   //ierr = PetscMalloc2(nrqs+1,&req_source2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2374   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2375   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2376   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2377   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2378 
2379   ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2380   for (i=0; i<nrqs; ++i) {
2381     idex = i;
2382     //ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2383     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2384     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2385     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2386     req_source2[i] = r_status2[i].MPI_SOURCE;
2387     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,req_source2[i],tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2388   }
2389   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2390   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2391 
2392   /* Wait on sends1 and sends2 */
2393   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2394   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2395 
2396   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2397   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2398   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2399   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2400   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2401   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2402 
2403   /* Now allocate buffers for a->j, and send them off */
2404   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2405   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2406   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2407   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2408 
2409   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2410   {
2411     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2412     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2413     PetscInt cend = C->cmap->rend;
2414     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2415 
2416     for (i=0; i<nrqr; i++) {
2417       rbuf1_i   = rbuf1[i];
2418       sbuf_aj_i = sbuf_aj[i];
2419       ct1       = 2*rbuf1_i[0] + 1;
2420       ct2       = 0;
2421       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2422         kmax = rbuf1[i][2*j];
2423         for (k=0; k<kmax; k++,ct1++) {
2424           row    = rbuf1_i[ct1] - rstart;
2425           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2426           ncols  = nzA + nzB;
2427           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2428 
2429           /* load the column indices for this row into cols*/
2430           cols = sbuf_aj_i + ct2;
2431 
2432           lwrite = 0;
2433           for (l=0; l<nzB; l++) {
2434             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2435           }
2436           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2437           for (l=0; l<nzB; l++) {
2438             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2439           }
2440 
2441           ct2 += ncols;
2442         }
2443       }
2444       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2445     }
2446   }
2447   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2448   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2449 
2450   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2451   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2452 
2453   /* create col map: global col of C -> local col of submatrices */
2454   if (scall == MAT_INITIAL_MATRIX) {
2455     const PetscInt *icol_i;
2456 #if defined(PETSC_USE_CTABLE)
2457     for (i=0; i<ismax; i++) {
2458       if (!allcolumns[i]) {
2459         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2460 
2461         jmax   = ncol[i];
2462         icol_i = icol[i];
2463         cmap_i = cmap[i];
2464         for (j=0; j<jmax; j++) {
2465           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2466         }
2467       } else cmap[i] = NULL;
2468     }
2469 #else
2470     for (i=0; i<ismax; i++) {
2471       if (!allcolumns[i]) {
2472         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2473         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2474         jmax   = ncol[i];
2475         icol_i = icol[i];
2476         cmap_i = cmap[i];
2477         for (j=0; j<jmax; j++) {
2478           cmap_i[icol_i[j]] = j+1;
2479         }
2480       } else cmap[i] = NULL;
2481     }
2482 #endif
2483   }
2484 
2485   /* Create lens which is required for MatCreate... */
2486   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2487   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2488   if (ismax) {
2489     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2490     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2491   }
2492   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2493 
2494   /* Update lens from local data */
2495   for (i=0; i<ismax; i++) {
2496     row2proc_i = row2proc[i];
2497     jmax = nrow[i];
2498     if (!allcolumns[i]) cmap_i = cmap[i];
2499     irow_i = irow[i];
2500     lens_i = lens[i];
2501     for (j=0; j<jmax; j++) {
2502       row = irow_i[j];
2503       proc = row2proc_i[j];
2504       if (proc == rank) {
2505         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2506         if (!allcolumns[i]) {
2507           for (k=0; k<ncols; k++) {
2508 #if defined(PETSC_USE_CTABLE)
2509             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2510 #else
2511             tcol = cmap_i[cols[k]];
2512 #endif
2513             if (tcol) lens_i[j]++;
2514           }
2515         } else { /* allcolumns */
2516           lens_i[j] = ncols;
2517         }
2518         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2519       }
2520     }
2521   }
2522 
2523   /* Create row map: global row of C -> local row of submatrices */
2524   if (scall == MAT_INITIAL_MATRIX) {
2525 #if defined(PETSC_USE_CTABLE)
2526     for (i=0; i<ismax; i++) {
2527       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2528       irow_i = irow[i];
2529       jmax   = nrow[i];
2530       for (j=0; j<jmax; j++) {
2531       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2532       }
2533     }
2534 #else
2535     if (ismax) {
2536       ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2537       ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2538     }
2539     for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2540     for (i=0; i<ismax; i++) {
2541       rmap_i = rmap[i];
2542       irow_i = irow[i];
2543       jmax   = nrow[i];
2544       for (j=0; j<jmax; j++) {
2545         rmap_i[irow_i[j]] = j;
2546       }
2547     }
2548 #endif
2549   }
2550 
2551   /* Update lens from offproc data */
2552   {
2553     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2554 
2555     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2556     for (tmp2=0; tmp2<nrqs; tmp2++) {
2557       //ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2558       idex2   = tmp2;
2559       idex    = pa[idex2];
2560       sbuf1_i = sbuf1[idex];
2561       jmax    = sbuf1_i[0];
2562       ct1     = 2*jmax+1;
2563       ct2     = 0;
2564       rbuf2_i = rbuf2[idex2];
2565       rbuf3_i = rbuf3[idex2];
2566       for (j=1; j<=jmax; j++) {
2567         is_no  = sbuf1_i[2*j-1];
2568         max1   = sbuf1_i[2*j];
2569         lens_i = lens[is_no];
2570         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2571         rmap_i = rmap[is_no];
2572         for (k=0; k<max1; k++,ct1++) {
2573 #if defined(PETSC_USE_CTABLE)
2574           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2575           row--;
2576           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2577 #else
2578           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2579 #endif
2580           max2 = rbuf2_i[ct1];
2581           for (l=0; l<max2; l++,ct2++) {
2582             if (!allcolumns[is_no]) {
2583 #if defined(PETSC_USE_CTABLE)
2584               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2585 #else
2586               tcol = cmap_i[rbuf3_i[ct2]];
2587 #endif
2588               if (tcol) lens_i[row]++;
2589             } else { /* allcolumns */
2590               lens_i[row]++; /* lens_i[row] += max2 ? */
2591             }
2592           }
2593         }
2594       }
2595     }
2596   }
2597   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2598   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2599   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2600   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2601   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2602 
2603   } // endof scall == MAT_INITIAL_MATRIX ||!ismax
2604 
2605   /* Create the submatrices */
2606   if (scall == MAT_INITIAL_MATRIX) {
2607     for (i=0; i<ismax; i++) {
2608       PetscInt    rbs,cbs;
2609       Mat_SeqAIJ* subc;
2610 
2611       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2612       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2613 
2614       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2615       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2616 
2617       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2618       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2619       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2620 
2621       /* create struct Mat_SubMat and attached it to submat */
2622       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2623       subc = (Mat_SeqAIJ*)submats[i]->data;
2624       subc->submatis1 = smat_i;
2625       smats[i]        = smat_i;
2626 
2627       smat_i->destroy          = submats[i]->ops->destroy;
2628       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2629       submats[i]->factortype   = C->factortype;
2630 
2631       smat_i->id          = i;
2632       smat_i->nrqs        = nrqs;
2633       smat_i->nrqr        = nrqr;
2634       smat_i->rbuf1       = rbuf1;
2635       smat_i->rbuf2       = rbuf2;
2636       smat_i->rbuf3       = rbuf3;
2637       smat_i->sbuf2       = sbuf2;
2638       smat_i->req_source2 = req_source2;
2639 
2640       smat_i->sbuf1       = sbuf1;
2641       smat_i->ptr         = ptr;
2642       smat_i->tmp         = tmp;
2643       smat_i->ctr         = ctr;
2644 
2645       smat_i->pa           = pa;
2646       smat_i->req_size     = req_size;
2647       smat_i->req_source1  = req_source1;
2648 
2649       smat_i->allcolumns  = allcolumns[i];
2650       smat_i->row2proc    = row2proc[i];
2651       smat_i->rmap        = rmap[i];
2652       smat_i->cmap        = cmap[i];
2653 #if defined(PETSC_USE_CTABLE)
2654       smat_i->rmap_loc    = NULL;//rmap_loc;
2655       smat_i->cmap_loc    = NULL;//cmap_loc;
2656 #endif
2657 
2658     }
2659   } // endof scall == MAT_INITIAL_MATRIX
2660 
2661   /* Allocate buffers for a->a, and send them off */
2662   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2663   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2664   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2665   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2666 
2667   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2668   {
2669     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2670     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2671     PetscInt    cend   = C->cmap->rend;
2672     PetscInt    *b_j   = b->j;
2673     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2674 
2675     for (i=0; i<nrqr; i++) {
2676       rbuf1_i   = rbuf1[i];
2677       sbuf_aa_i = sbuf_aa[i];
2678       ct1       = 2*rbuf1_i[0]+1;
2679       ct2       = 0;
2680       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2681         kmax = rbuf1_i[2*j];
2682         for (k=0; k<kmax; k++,ct1++) {
2683           row    = rbuf1_i[ct1] - rstart;
2684           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2685           ncols  = nzA + nzB;
2686           cworkB = b_j + b_i[row];
2687           vworkA = a_a + a_i[row];
2688           vworkB = b_a + b_i[row];
2689 
2690           /* load the column values for this row into vals*/
2691           vals = sbuf_aa_i+ct2;
2692 
2693           lwrite = 0;
2694           for (l=0; l<nzB; l++) {
2695             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2696           }
2697           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2698           for (l=0; l<nzB; l++) {
2699             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2700           }
2701 
2702           ct2 += ncols;
2703         }
2704       }
2705       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2706     }
2707   }
2708 
2709   if (!ismax) {
2710     ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2711     ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2712   }
2713 
2714 
2715   /* Assemble the matrices */
2716   /* First assemble the local rows */
2717   {
2718     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2719     PetscScalar *imat_a;
2720 
2721     for (i=0; i<ismax; i++) {
2722       row2proc_i = row2proc[i];
2723       mat       = (Mat_SeqAIJ*)submats[i]->data;
2724       imat_ilen = mat->ilen;
2725       imat_j    = mat->j;
2726       imat_i    = mat->i;
2727       imat_a    = mat->a;
2728 
2729       if (!allcolumns[i]) cmap_i = cmap[i];
2730       rmap_i = rmap[i];
2731       irow_i = irow[i];
2732       jmax   = nrow[i];
2733       for (j=0; j<jmax; j++) {
2734         row  = irow_i[j];
2735         proc = row2proc_i[j];
2736         if (proc == rank) {
2737           old_row = row;
2738 #if defined(PETSC_USE_CTABLE)
2739           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2740           row--;
2741 #else
2742           row = rmap_i[row];
2743 #endif
2744           ilen_row = imat_ilen[row];
2745           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2746           mat_i    = imat_i[row];
2747           mat_a    = imat_a + mat_i;
2748           mat_j    = imat_j + mat_i;
2749           if (!allcolumns[i]) {
2750             for (k=0; k<ncols; k++) {
2751 #if defined(PETSC_USE_CTABLE)
2752               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2753 #else
2754               tcol = cmap_i[cols[k]];
2755 #endif
2756               if (tcol) {
2757                 *mat_j++ = tcol - 1;
2758                 *mat_a++ = vals[k];
2759                 ilen_row++;
2760               }
2761             }
2762           } else { /* allcolumns */
2763             for (k=0; k<ncols; k++) {
2764               *mat_j++ = cols[k];  /* global col index! */
2765               *mat_a++ = vals[k];
2766               ilen_row++;
2767             }
2768           }
2769           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2770 
2771           imat_ilen[row] = ilen_row;
2772         }
2773       }
2774     }
2775   }
2776 
2777   /*   Now assemble the off proc rows*/
2778   {
2779     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2780     PetscInt    *imat_j,*imat_i;
2781     PetscScalar *imat_a,*rbuf4_i;
2782 
2783     ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2784     for (tmp2=0; tmp2<nrqs; tmp2++) {
2785       //ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2786       idex2   = tmp2;
2787       idex    = pa[idex2];
2788       sbuf1_i = sbuf1[idex];
2789       jmax    = sbuf1_i[0];
2790       ct1     = 2*jmax + 1;
2791       ct2     = 0;
2792       rbuf2_i = rbuf2[idex2];
2793       rbuf3_i = rbuf3[idex2];
2794       rbuf4_i = rbuf4[idex2];
2795       for (j=1; j<=jmax; j++) {
2796         is_no     = sbuf1_i[2*j-1];
2797         rmap_i    = rmap[is_no];
2798         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2799         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2800         imat_ilen = mat->ilen;
2801         imat_j    = mat->j;
2802         imat_i    = mat->i;
2803         imat_a    = mat->a;
2804         max1      = sbuf1_i[2*j];
2805         for (k=0; k<max1; k++,ct1++) {
2806           row = sbuf1_i[ct1];
2807 #if defined(PETSC_USE_CTABLE)
2808           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2809           row--;
2810 #else
2811           row = rmap_i[row];
2812 #endif
2813           ilen  = imat_ilen[row];
2814           mat_i = imat_i[row];
2815           mat_a = imat_a + mat_i;
2816           mat_j = imat_j + mat_i;
2817           max2  = rbuf2_i[ct1];
2818           if (!allcolumns[is_no]) {
2819             for (l=0; l<max2; l++,ct2++) {
2820 
2821 #if defined(PETSC_USE_CTABLE)
2822               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2823 #else
2824               tcol = cmap_i[rbuf3_i[ct2]];
2825 #endif
2826               if (tcol) {
2827                 *mat_j++ = tcol - 1;
2828                 *mat_a++ = rbuf4_i[ct2];
2829                 ilen++;
2830               }
2831             }
2832           } else { /* allcolumns */
2833             for (l=0; l<max2; l++,ct2++) {
2834               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2835               *mat_a++ = rbuf4_i[ct2];
2836               ilen++;
2837             }
2838           }
2839           imat_ilen[row] = ilen;
2840         }
2841       }
2842     }
2843   }
2844 
2845   /* sort the rows */
2846   {
2847     PetscInt    *imat_ilen,*imat_j,*imat_i;
2848     PetscScalar *imat_a;
2849 
2850     for (i=0; i<ismax; i++) {
2851       mat       = (Mat_SeqAIJ*)submats[i]->data;
2852       imat_j    = mat->j;
2853       imat_i    = mat->i;
2854       imat_a    = mat->a;
2855       imat_ilen = mat->ilen;
2856 
2857       if (allcolumns[i]) continue;
2858       jmax = nrow[i];
2859       for (j=0; j<jmax; j++) {
2860         PetscInt ilen;
2861 
2862         mat_i = imat_i[j];
2863         mat_a = imat_a + mat_i;
2864         mat_j = imat_j + mat_i;
2865         ilen  = imat_ilen[j];
2866         ierr  = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2867       }
2868     }
2869   }
2870 
2871   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2872   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2873   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2874   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2875   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2876 
2877   /* Restore the indices */
2878   for (i=0; i<ismax; i++) {
2879     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2880     if (!allcolumns[i]) {
2881       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2882     }
2883   }
2884 
2885   /* Destroy allocated memory */
2886   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2887   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2888 
2889   if (!ismax) {
2890     ierr = PetscFree(pa);CHKERRQ(ierr);
2891 
2892     ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2893     for (i=0; i<nrqr; ++i) {
2894       ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2895     }
2896     for (i=0; i<nrqs; ++i) {
2897       ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2898     }
2899 
2900     ierr = PetscFree3(sbuf2,req_size,req_source1);CHKERRQ(ierr);
2901     ierr = PetscFree3(req_source2,rbuf2,rbuf3);CHKERRQ(ierr);
2902   }
2903 
2904   for (i=0; i<nrqs; ++i) {
2905     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2906   }
2907   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2908 
2909   if (scall == MAT_INITIAL_MATRIX) {
2910     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2911     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2912   }
2913   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2914   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2915 
2916   ierr = PetscFree(rmap);CHKERRQ(ierr);
2917   ierr = PetscFree(cmap);CHKERRQ(ierr);
2918 
2919   if (scall == MAT_INITIAL_MATRIX) {
2920     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2921     ierr = PetscFree(lens);CHKERRQ(ierr);
2922   }
2923 
2924   for (i=0; i<ismax; i++) {
2925     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2926     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2927   }
2928 
2929   ierr = PetscFree(issorted);CHKERRQ(ierr);
2930   ierr = PetscFree(row2proc);CHKERRQ(ierr);
2931   ierr = PetscFree(smats);CHKERRQ(ierr);
2932   PetscFunctionReturn(0);
2933 }
2934 
2935 /*
2936  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2937  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2938  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2939  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2940  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2941  state, and needs to be "assembled" later by compressing B's column space.
2942 
2943  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2944  Following this call, C->A & C->B have been created, even if empty.
2945  */
2946 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2947 {
2948   /* If making this function public, change the error returned in this function away from _PLIB. */
2949   PetscErrorCode ierr;
2950   Mat_MPIAIJ     *aij;
2951   Mat_SeqAIJ     *Baij;
2952   PetscBool      seqaij,Bdisassembled;
2953   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2954   PetscScalar    v;
2955   const PetscInt *rowindices,*colindices;
2956 
2957   PetscFunctionBegin;
2958   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2959   if (A) {
2960     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2961     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2962     if (rowemb) {
2963       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2964       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2965     } else {
2966       if (C->rmap->n != A->rmap->n) {
2967 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2968       }
2969     }
2970     if (dcolemb) {
2971       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2972       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2973     } else {
2974       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2975     }
2976   }
2977   if (B) {
2978     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2979     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2980     if (rowemb) {
2981       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2982       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2983     } else {
2984       if (C->rmap->n != B->rmap->n) {
2985 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2986       }
2987     }
2988     if (ocolemb) {
2989       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2990       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2991     } else {
2992       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2993     }
2994   }
2995 
2996   aij    = (Mat_MPIAIJ*)(C->data);
2997   if (!aij->A) {
2998     /* Mimic parts of MatMPIAIJSetPreallocation() */
2999     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
3000     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
3001     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
3002     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
3003     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
3004   }
3005   if (A) {
3006     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
3007   } else {
3008     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
3009   }
3010   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
3011     /*
3012       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
3013       need to "disassemble" B -- convert it to using C's global indices.
3014       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
3015 
3016       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
3017       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
3018 
3019       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
3020       At least avoid calling MatSetValues() and the implied searches?
3021     */
3022 
3023     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
3024 #if defined(PETSC_USE_CTABLE)
3025       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
3026 #else
3027       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
3028       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
3029       if (aij->B) {
3030         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
3031       }
3032 #endif
3033       ngcol = 0;
3034       if (aij->lvec) {
3035 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
3036       }
3037       if (aij->garray) {
3038 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
3039 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
3040       }
3041       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3042       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3043     }
3044     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3045       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3046     }
3047     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3048       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3049     }
3050   }
3051   Bdisassembled = PETSC_FALSE;
3052   if (!aij->B) {
3053     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3054     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3055     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3056     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3057     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3058     Bdisassembled = PETSC_TRUE;
3059   }
3060   if (B) {
3061     Baij = (Mat_SeqAIJ*)(B->data);
3062     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3063       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3064       for (i=0; i<B->rmap->n; i++) {
3065 	nz[i] = Baij->i[i+1] - Baij->i[i];
3066       }
3067       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3068       ierr = PetscFree(nz);CHKERRQ(ierr);
3069     }
3070 
3071     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3072     shift = rend-rstart;
3073     count = 0;
3074     rowindices = NULL;
3075     colindices = NULL;
3076     if (rowemb) {
3077       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3078     }
3079     if (ocolemb) {
3080       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3081     }
3082     for (i=0; i<B->rmap->n; i++) {
3083       PetscInt row;
3084       row = i;
3085       if (rowindices) row = rowindices[i];
3086       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3087 	col  = Baij->j[count];
3088 	if (colindices) col = colindices[col];
3089 	if (Bdisassembled && col>=rstart) col += shift;
3090 	v    = Baij->a[count];
3091 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3092 	++count;
3093       }
3094     }
3095     /* No assembly for aij->B is necessary. */
3096     /* FIXME: set aij->B's nonzerostate correctly. */
3097   } else {
3098     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3099   }
3100   C->preallocated  = PETSC_TRUE;
3101   C->was_assembled = PETSC_FALSE;
3102   C->assembled     = PETSC_FALSE;
3103    /*
3104       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3105       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3106    */
3107   PetscFunctionReturn(0);
3108 }
3109 
3110 /*
3111   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3112  */
3113 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3114 {
3115   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3116 
3117   PetscFunctionBegin;
3118   PetscValidPointer(A,2);
3119   PetscValidPointer(B,3);
3120   /* FIXME: make sure C is assembled */
3121   *A = aij->A;
3122   *B = aij->B;
3123   /* Note that we don't incref *A and *B, so be careful! */
3124   PetscFunctionReturn(0);
3125 }
3126 
3127 /*
3128   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3129   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3130 */
3131 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3132                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3133 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3134 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3135 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3136 {
3137   PetscErrorCode ierr;
3138   PetscMPIInt    isize,flag;
3139   PetscInt       i,ii,cismax,ispar;
3140   Mat            *A,*B;
3141   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3142 
3143   PetscFunctionBegin;
3144   if (!ismax) PetscFunctionReturn(0);
3145 
3146   for (i = 0, cismax = 0; i < ismax; ++i) {
3147     PetscMPIInt isize;
3148     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3149     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3150     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3151     if (isize > 1) ++cismax;
3152   }
3153 
3154   /*
3155      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3156      ispar counts the number of parallel ISs across C's comm.
3157   */
3158   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3159   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3160     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3161     PetscFunctionReturn(0);
3162   }
3163 
3164   /* if (ispar) */
3165   /*
3166     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3167     These are used to extract the off-diag portion of the resulting parallel matrix.
3168     The row IS for the off-diag portion is the same as for the diag portion,
3169     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3170   */
3171   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3172   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3173   for (i = 0, ii = 0; i < ismax; ++i) {
3174     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3175     if (isize > 1) {
3176       /*
3177 	 TODO: This is the part that's ***NOT SCALABLE***.
3178 	 To fix this we need to extract just the indices of C's nonzero columns
3179 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3180 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3181 	 to be done without serializing on the IS list, so, most likely, it is best
3182 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3183       */
3184       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3185       /* Now we have to
3186 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3187 	     were sorted on each rank, concatenated they might no longer be sorted;
3188 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3189 	     indices in the nondecreasing order to the original index positions.
3190 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3191       */
3192       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3193       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3194       ++ii;
3195     }
3196   }
3197   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3198   for (i = 0, ii = 0; i < ismax; ++i) {
3199     PetscInt       j,issize;
3200     const PetscInt *indices;
3201 
3202     /*
3203        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3204      */
3205     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3206     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3207     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3208     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3209     for (j = 1; j < issize; ++j) {
3210       if (indices[j] == indices[j-1]) {
3211 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3212       }
3213     }
3214     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3215 
3216 
3217     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3218     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3219     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3220     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3221     for (j = 1; j < issize; ++j) {
3222       if (indices[j-1] == indices[j]) {
3223 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3224       }
3225     }
3226     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3227     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3228     if (isize > 1) {
3229       cisrow[ii] = isrow[i];
3230       ++ii;
3231     }
3232   }
3233   /*
3234     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3235     array of sequential matrices underlying the resulting parallel matrices.
3236     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3237     contain duplicates.
3238 
3239     There are as many diag matrices as there are original index sets. There are only as many parallel
3240     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3241 
3242     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3243     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3244       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3245       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3246       with A[i] and B[ii] extracted from the corresponding MPI submat.
3247     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3248       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3249       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3250       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3251       values into A[i] and B[ii] sitting inside the corresponding submat.
3252     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3253       A[i], B[ii], AA[i] or BB[ii] matrices.
3254   */
3255   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3256   if (scall == MAT_INITIAL_MATRIX) {
3257     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3258   }
3259 
3260   /* Now obtain the sequential A and B submatrices separately. */
3261   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3262   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3263   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3264 
3265   /*
3266     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3267     matrices A & B have been extracted directly into the parallel matrices containing them, or
3268     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3269     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3270     to have the same sparsity pattern.
3271     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3272     must be constructed for C. This is done by setseqmat(s).
3273   */
3274   for (i = 0, ii = 0; i < ismax; ++i) {
3275     /*
3276        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3277        That way we can avoid sorting and computing permutations when reusing.
3278        To this end:
3279         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3280 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3281 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3282     */
3283     MatStructure pattern;
3284     pattern = DIFFERENT_NONZERO_PATTERN;
3285 
3286     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3287     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3288     if (isize > 1) {
3289       if (scall == MAT_INITIAL_MATRIX) {
3290 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3291 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3292 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3293 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3294 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3295       }
3296       /*
3297 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3298       */
3299       {
3300 	Mat AA,BB;
3301         AA = A[i];
3302         BB = B[ii];
3303 	if (AA || BB) {
3304 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3305 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3306 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3307 	}
3308 
3309         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3310         ierr = MatDestroy(&BB);CHKERRQ(ierr);
3311       }
3312       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3313       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3314       ++ii;
3315     } else { /* if (isize == 1) */
3316       if (scall == MAT_REUSE_MATRIX) {
3317         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3318       }
3319       if (isrow_p[i] || iscol_p[i]) {
3320         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3321         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3322 	/* Otherwise A is extracted straight into (*submats)[i]. */
3323 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3324 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3325       } else (*submat)[i] = A[i];
3326     }
3327     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3328     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3329   }
3330   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3331   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3332   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3333   ierr = PetscFree(A);CHKERRQ(ierr);
3334   ierr = PetscFree(B);CHKERRQ(ierr);
3335   PetscFunctionReturn(0);
3336 }
3337 
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* AIJ-specific entry point: delegate to the type-generic driver, supplying the
     SeqAIJ/MPIAIJ callbacks for sequential extraction (MatGetSubMatrices_MPIAIJ),
     local A/B retrieval (MatGetSeqMats_MPIAIJ), and embedding of sequential pieces
     into the resulting matrices (MatSetSeqMat_SeqAIJ / MatSetSeqMats_MPIAIJ). */
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3346