xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 70141dcc13022b66eb17bf6f84e28491c2a2e3b1)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode   ierr;
52   MPI_Comm         comm;
53   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt  *indices_i,**indices;
57   PetscLayout      rmap;
58   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
59   PetscSF          sf;
60   PetscSFNode     *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put all together so that we
69    * can optimize communication
70    *  */
71   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++){
82     length_i     = length[i];
83     indices_i    = indices[i];
84     for (j=0; j<length_i; j++){
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank){
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages
99    * generally speaking, we do not need to exchange
100    * data when overlap is 1
101    * */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages
104    * It usually corresponds to overlap 1
105    * */
106   if (!reducednrrows){
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communcation */
114   for (i=0; i<size; i++){
115    if (tosizes_temp[i]){
116      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117      tosizes_temp[i] = nto; /* a map from processor to index */
118      toranks[nto++]  = i; /* processor */
119    }
120   }
121   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   for (i=0; i<nto; i++){
123     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125   }
126   /* send information to other processors */
127   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
128   nrecvrows = 0;
129   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
131   nrecvrows = 0;
132   for (i=0; i<nfrom; i++){
133     for (j=0; j<fromsizes[2*i]; j++){
134       remote[nrecvrows].rank    = fromranks[i];
135       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136     }
137   }
138   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
139   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
140   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
141   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
142   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
143   /* message pair <no of is, row>  */
144   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
145   for (i=0; i<nrrows; i++){
146     owner = rrow_ranks[i]; /* processor */
147     j     = tosizes_temp[owner]; /* index */
148     todata[toffsets[j]++] = rrow_isids[i];
149     todata[toffsets[j]++] = remoterows[i];
150   }
151   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
152   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
153   ierr = PetscFree(toffsets);CHKERRQ(ierr);
154   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
155   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
157   /* send rows belonging to the remote so that then we could get the overlapping data back */
158   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
159   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
160   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
161   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
162   ierr = PetscFree(fromranks);CHKERRQ(ierr);
163   nrecvrows = 0;
164   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
166   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
167   nrecvrows = 0;
168   for (i=0; i<nto; i++){
169     for (j=0; j<tosizes[2*i]; j++){
170       remote[nrecvrows].rank    = toranks[i];
171       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172     }
173   }
174   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
175   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
176   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
177   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
178   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
179   /* overlap communication and computation */
180   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
181   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
182   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
183   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
184   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
185   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
186   ierr = PetscFree(toranks);CHKERRQ(ierr);
187   ierr = PetscFree(tosizes);CHKERRQ(ierr);
188   ierr = PetscFree(todata);CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++){
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++){
209     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
210     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
211     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
213   }
214   /* retrieve information to get row id and its overlap */
215   for (i=0; i<nrecvs; ){
216     is_id      = recvdata[i++];
217     data_size  = recvdata[i++];
218     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
219     isz_i      = isz[is_id];
220     for (j=0; j< data_size; j++){
221       col = recvdata[i++];
222       indices_i[isz_i++] = col;
223     }
224     isz[is_id] = isz_i;
225   }
226   /* remove duplicate entities */
227   for (i=0; i<nidx; i++){
228     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
229     isz_i      = isz[i];
230     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
231     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
232   }
233   ierr = PetscFree(isz);CHKERRQ(ierr);
234   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
239 {
240   PetscLayout       rmap,cmap;
241   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
242   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
243   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
244   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
245   Mat               amat,bmat;
246   PetscMPIInt       rank;
247   PetscBool         done;
248   MPI_Comm          comm;
249   PetscErrorCode    ierr;
250 
251   PetscFunctionBegin;
252   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
253   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
254   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
255   /* Even if the mat is symmetric, we still assume it is not symmetric */
256   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
257   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
258   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
260   /* total number of nonzero values is used to estimate the memory usage in the next step */
261   tnz  = ai[an]+bi[bn];
262   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
263   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
264   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
265   /* to find the longest message */
266   max_fszs = 0;
267   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
268   /* better way to estimate number of nonzero in the mat??? */
269   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
270   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
271   rows_pos  = 0;
272   totalrows = 0;
273   for (i=0; i<nfrom; i++){
274     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
275     /* group data together */
276     for (j=0; j<fromsizes[2*i]; j+=2){
277       is_id                       = fromrows[rows_pos++];/* no of is */
278       rows_i                      = rows_data[is_id];
279       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
280     }
281     /* estimate a space to avoid multiple allocations  */
282     for (j=0; j<nidx; j++){
283       indvc_ij = 0;
284       rows_i   = rows_data[j];
285       for (l=0; l<rows_pos_i[j]; l++){
286         row    = rows_i[l]-rstart;
287         start  = ai[row];
288         end    = ai[row+1];
289         for (k=start; k<end; k++){ /* Amat */
290           col = aj[k] + cstart;
291           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
292         }
293         start = bi[row];
294         end   = bi[row+1];
295         for (k=start; k<end; k++) { /* Bmat */
296           col = gcols[bj[k]];
297           indices_tmp[indvc_ij++] = col;
298         }
299       }
300       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
301       indv_counts[i*nidx+j] = indvc_ij;
302       totalrows            += indvc_ij;
303     }
304   }
305   /* message triple <no of is, number of rows, rows> */
306   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
307   totalrows = 0;
308   rows_pos  = 0;
309   /* use this code again */
310   for (i=0;i<nfrom;i++){
311     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
312     for (j=0; j<fromsizes[2*i]; j+=2){
313       is_id                       = fromrows[rows_pos++];
314       rows_i                      = rows_data[is_id];
315       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
316     }
317     /* add data  */
318     for (j=0; j<nidx; j++){
319       if (!indv_counts[i*nidx+j]) continue;
320       indvc_ij = 0;
321       sbdata[totalrows++] = j;
322       sbdata[totalrows++] = indv_counts[i*nidx+j];
323       sbsizes[2*i]       += 2;
324       rows_i              = rows_data[j];
325       for (l=0; l<rows_pos_i[j]; l++){
326         row   = rows_i[l]-rstart;
327         start = ai[row];
328         end   = ai[row+1];
329         for (k=start; k<end; k++){ /* Amat */
330           col = aj[k] + cstart;
331           indices_tmp[indvc_ij++] = col;
332         }
333         start = bi[row];
334         end   = bi[row+1];
335         for (k=start; k<end; k++) { /* Bmat */
336           col = gcols[bj[k]];
337           indices_tmp[indvc_ij++] = col;
338         }
339       }
340       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
341       sbsizes[2*i]  += indvc_ij;
342       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
343       totalrows += indvc_ij;
344     }
345   }
346   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
347   for (i=0; i<nfrom; i++){
348     offsets[i+1]   = offsets[i] + sbsizes[2*i];
349     sbsizes[2*i+1] = offsets[i];
350   }
351   ierr = PetscFree(offsets);CHKERRQ(ierr);
352   if (sbrowsizes) *sbrowsizes = sbsizes;
353   if (sbrows) *sbrows = sbdata;
354   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
355   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
356   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
357   PetscFunctionReturn(0);
358 }
359 
360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
361 {
362   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
363   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
364   PetscInt          lsize,lsize_tmp,owner;
365   PetscMPIInt       rank;
366   Mat               amat,bmat;
367   PetscBool         done;
368   PetscLayout       cmap,rmap;
369   MPI_Comm          comm;
370   PetscErrorCode    ierr;
371 
372   PetscFunctionBegin;
373   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
374   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
375   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
376   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
377   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
378   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
379   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
380   /* is it a safe way to compute number of nonzero values ? */
381   tnz  = ai[an]+bi[bn];
382   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
383   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
384   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
385   /* it is a better way to estimate memory than the old implementation
386    * where global size of matrix is used
387    * */
388   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
389   for (i=0; i<nidx; i++) {
390     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
391     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
392     lsize_tmp = 0;
393     for (j=0; j<lsize; j++) {
394       owner = -1;
395       row   = indices[j];
396       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
397       if (owner != rank) continue;
398       /* local number */
399       row  -= rstart;
400       start = ai[row];
401       end   = ai[row+1];
402       for (k=start; k<end; k++) { /* Amat */
403         col = aj[k] + cstart;
404         indices_temp[lsize_tmp++] = col;
405       }
406       start = bi[row];
407       end   = bi[row+1];
408       for (k=start; k<end; k++) { /* Bmat */
409         col = gcols[bj[k]];
410         indices_temp[lsize_tmp++] = col;
411       }
412     }
413    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
414    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
415    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
416    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
417   }
418   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
419   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
420   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
421   PetscFunctionReturn(0);
422 }
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
446   nrqr - no of requests recieved (which have to be or which have been processed
447 */
448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
449 {
450   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
451   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
452   const PetscInt **idx,*idx_i;
453   PetscInt       *n,**data,len;
454 #if defined(PETSC_USE_CTABLE)
455   PetscTable     *table_data,table_data_i;
456   PetscInt       *tdata,tcount,tcount_max;
457 #else
458   PetscInt       *data_i,*d_p;
459 #endif
460   PetscErrorCode ierr;
461   PetscMPIInt    size,rank,tag1,tag2;
462   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
463   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
464   PetscBT        *table;
465   MPI_Comm       comm;
466   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
467   MPI_Status     *s_status,*recv_status;
468   char           *t_p;
469 
470   PetscFunctionBegin;
471   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
472   size = c->size;
473   rank = c->rank;
474   M    = C->rmap->N;
475 
476   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
477   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
478 
479   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
480 
481   for (i=0; i<imax; i++) {
482     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
483     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
484   }
485 
486   /* evaluate communication - mesg to who,length of mesg, and buffer space
487      required. Based on this, buffers are allocated, and data copied into them  */
488   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
489   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
490   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
491   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
492   for (i=0; i<imax; i++) {
493     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
494     idx_i = idx[i];
495     len   = n[i];
496     for (j=0; j<len; j++) {
497       row = idx_i[j];
498       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
499       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
500       w4[proc]++;
501     }
502     for (j=0; j<size; j++) {
503       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
504     }
505   }
506 
507   nrqs     = 0;              /* no of outgoing messages */
508   msz      = 0;              /* total mesg length (for all proc */
509   w1[rank] = 0;              /* no mesg sent to intself */
510   w3[rank] = 0;
511   for (i=0; i<size; i++) {
512     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
513   }
514   /* pa - is list of processors to communicate with */
515   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
516   for (i=0,j=0; i<size; i++) {
517     if (w1[i]) {pa[j] = i; j++;}
518   }
519 
520   /* Each message would have a header = 1 + 2*(no of IS) + data */
521   for (i=0; i<nrqs; i++) {
522     j      = pa[i];
523     w1[j] += w2[j] + 2*w3[j];
524     msz   += w1[j];
525   }
526 
527   /* Determine the number of messages to expect, their lengths, from from-ids */
528   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
529   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
530 
531   /* Now post the Irecvs corresponding to these messages */
532   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
533 
534   /* Allocate Memory for outgoing messages */
535   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
536   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
537   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
538 
539   {
540     PetscInt *iptr = tmp,ict  = 0;
541     for (i=0; i<nrqs; i++) {
542       j         = pa[i];
543       iptr     +=  ict;
544       outdat[j] = iptr;
545       ict       = w1[j];
546     }
547   }
548 
549   /* Form the outgoing messages */
550   /* plug in the headers */
551   for (i=0; i<nrqs; i++) {
552     j            = pa[i];
553     outdat[j][0] = 0;
554     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
555     ptr[j]       = outdat[j] + 2*w3[j] + 1;
556   }
557 
558   /* Memory for doing local proc's work */
559   {
560     PetscInt M_BPB_imax = 0;
561 #if defined(PETSC_USE_CTABLE)
562     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
563     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
564     for (i=0; i<imax; i++) {
565       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
566     }
567     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
570     }
571 #else
572     PetscInt Mimax = 0;
573     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
574     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
575     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578       data[i]  = d_p + M*i;
579     }
580 #endif
581   }
582 
583   /* Parse the IS and update local tables and the outgoing buf with the data */
584   {
585     PetscInt n_i,isz_i,*outdat_j,ctr_j;
586     PetscBT  table_i;
587 
588     for (i=0; i<imax; i++) {
589       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
590       n_i     = n[i];
591       table_i = table[i];
592       idx_i   = idx[i];
593 #if defined(PETSC_USE_CTABLE)
594       table_data_i = table_data[i];
595 #else
596       data_i  = data[i];
597 #endif
598       isz_i   = isz[i];
599       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
600         row  = idx_i[j];
601         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
602         if (proc != rank) { /* copy to the outgoing buffer */
603           ctr[proc]++;
604           *ptr[proc] = row;
605           ptr[proc]++;
606         } else if (!PetscBTLookupSet(table_i,row)) {
607 #if defined(PETSC_USE_CTABLE)
608           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
609 #else
610           data_i[isz_i] = row; /* Update the local table */
611 #endif
612           isz_i++;
613         }
614       }
615       /* Update the headers for the current IS */
616       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
617         if ((ctr_j = ctr[j])) {
618           outdat_j        = outdat[j];
619           k               = ++outdat_j[0];
620           outdat_j[2*k]   = ctr_j;
621           outdat_j[2*k-1] = i;
622         }
623       }
624       isz[i] = isz_i;
625     }
626   }
627 
628   /*  Now  post the sends */
629   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
630   for (i=0; i<nrqs; ++i) {
631     j    = pa[i];
632     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
633   }
634 
635   /* No longer need the original indices */
636   for (i=0; i<imax; ++i) {
637     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
638   }
639   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
640 
641   for (i=0; i<imax; ++i) {
642     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
643   }
644 
645   /* Do Local work */
646 #if defined(PETSC_USE_CTABLE)
647   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
648 #else
649   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
650 #endif
651 
652   /* Receive messages */
653   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
654   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
655 
656   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
657   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
658 
659   /* Phase 1 sends are complete - deallocate buffers */
660   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
661   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
662 
663   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
664   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
665   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
666   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
667   ierr = PetscFree(rbuf);CHKERRQ(ierr);
668 
669 
670   /* Send the data back */
671   /* Do a global reduction to know the buffer space req for incoming messages */
672   {
673     PetscMPIInt *rw1;
674 
675     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
676 
677     for (i=0; i<nrqr; ++i) {
678       proc = recv_status[i].MPI_SOURCE;
679 
680       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
681       rw1[proc] = isz1[i];
682     }
683     ierr = PetscFree(onodes1);CHKERRQ(ierr);
684     ierr = PetscFree(olengths1);CHKERRQ(ierr);
685 
686     /* Determine the number of messages to expect, their lengths, from from-ids */
687     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
688     ierr = PetscFree(rw1);CHKERRQ(ierr);
689   }
690   /* Now post the Irecvs corresponding to these messages */
691   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
692 
693   /* Now  post the sends */
694   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
695   for (i=0; i<nrqr; ++i) {
696     j    = recv_status[i].MPI_SOURCE;
697     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
698   }
699 
700   /* receive work done on other processors */
701   {
702     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
703     PetscMPIInt idex;
704     PetscBT     table_i;
705     MPI_Status  *status2;
706 
707     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
708     for (i=0; i<nrqs; ++i) {
709       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
710       /* Process the message */
711       rbuf2_i = rbuf2[idex];
712       ct1     = 2*rbuf2_i[0]+1;
713       jmax    = rbuf2[idex][0];
714       for (j=1; j<=jmax; j++) {
715         max     = rbuf2_i[2*j];
716         is_no   = rbuf2_i[2*j-1];
717         isz_i   = isz[is_no];
718         table_i = table[is_no];
719 #if defined(PETSC_USE_CTABLE)
720         table_data_i = table_data[is_no];
721 #else
722         data_i  = data[is_no];
723 #endif
724         for (k=0; k<max; k++,ct1++) {
725           row = rbuf2_i[ct1];
726           if (!PetscBTLookupSet(table_i,row)) {
727 #if defined(PETSC_USE_CTABLE)
728             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
729 #else
730             data_i[isz_i] = row;
731 #endif
732             isz_i++;
733           }
734         }
735         isz[is_no] = isz_i;
736       }
737     }
738 
739     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
740     ierr = PetscFree(status2);CHKERRQ(ierr);
741   }
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767   }
768 
769   ierr = PetscFree(onodes2);CHKERRQ(ierr);
770   ierr = PetscFree(olengths2);CHKERRQ(ierr);
771 
772   ierr = PetscFree(pa);CHKERRQ(ierr);
773   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
774   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
775   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
776   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
779   ierr = PetscFree(s_status);CHKERRQ(ierr);
780   ierr = PetscFree(recv_status);CHKERRQ(ierr);
781   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
782   ierr = PetscFree(xdata);CHKERRQ(ierr);
783   ierr = PetscFree(isz1);CHKERRQ(ierr);
784 #if defined(PETSC_USE_CTABLE)
785   for (i=0; i<imax; i++) {
786     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
787   }
788   ierr = PetscFree(table_data);CHKERRQ(ierr);
789   ierr = PetscFree(tdata);CHKERRQ(ierr);
790   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
791 #else
792   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
793 #endif
794   PetscFunctionReturn(0);
795 }
796 
797 /*
798    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
799        the work on the local processor.
800 
801      Inputs:
802       C      - MAT_MPIAIJ;
803       imax - total no of index sets processed at a time;
804       table  - an array of char - size = m bits.
805 
806      Output:
807       isz    - array containing the count of the solution elements corresponding
808                to each index set;
809       data or table_data  - pointer to the solutions
810 */
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
900 /*
901       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
902          and return the output
903 
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
907            rbuf - an array of pointers to the recieved requests
908 
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912 
913   For better efficiency perhaps we should malloc separately each xdata[i],
914 then if a remalloc is required we need only copy the data for that one row
915 rather then all previous rows as it is now where a single large chunck of
916 memory is used.
917 
918 */
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (!(ct3 < mem_estimate)) {
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (!(ct3 < mem_estimate)) {
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* Update the header*/
1024       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1042 {
1043   Mat            B;
1044   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1045   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046   PetscErrorCode ierr;
1047   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1048   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1049   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1050   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1051 
1052   PetscFunctionBegin;
1053   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1054   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1055 
1056   if (scall == MAT_INITIAL_MATRIX) {
1057     /* ----------------------------------------------------------------
1058          Tell every processor the number of nonzeros per row
1059     */
1060     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1061     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1062       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1063     }
1064     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1065     for (i=0; i<size; i++) {
1066       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1067       displs[i]     = A->rmap->range[i];
1068     }
1069 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1070     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1071 #else
1072     sendcount = A->rmap->rend - A->rmap->rstart;
1073     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1074 #endif
1075     /* ---------------------------------------------------------------
1076          Create the sequential matrix of the same type as the local block diagonal
1077     */
1078     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1079     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1080     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1081     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1082     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1083     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1084     **Bin = B;
1085     b     = (Mat_SeqAIJ*)B->data;
1086 
1087     /*--------------------------------------------------------------------
1088        Copy my part of matrix column indices over
1089     */
1090     sendcount  = ad->nz + bd->nz;
1091     jsendbuf   = b->j + b->i[rstarts[rank]];
1092     a_jsendbuf = ad->j;
1093     b_jsendbuf = bd->j;
1094     n          = A->rmap->rend - A->rmap->rstart;
1095     cnt        = 0;
1096     for (i=0; i<n; i++) {
1097 
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into useable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     sendcount = ad->nz + bd->nz;
1161     sendbuf   = b->a + b->i[rstarts[rank]];
1162     a_sendbuf = ad->a;
1163     b_sendbuf = bd->a;
1164     b_sendj   = bd->j;
1165     n         = A->rmap->rend - A->rmap->rstart;
1166     cnt       = 0;
1167     for (i=0; i<n; i++) {
1168 
1169       /* put in lower diagonal portion */
1170       m = bd->i[i+1] - bd->i[i];
1171       while (m > 0) {
1172         /* is it above diagonal (in bd (compressed) numbering) */
1173         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1174         sendbuf[cnt++] = *b_sendbuf++;
1175         m--;
1176         b_sendj++;
1177       }
1178 
1179       /* put in diagonal portion */
1180       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1181         sendbuf[cnt++] = *a_sendbuf++;
1182       }
1183 
1184       /* put in upper diagonal portion */
1185       while (m-- > 0) {
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         b_sendj++;
1188       }
1189     }
1190     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191 
1192     /* -----------------------------------------------------------------
1193        Gather all numerical values to all processors
1194     */
1195     if (!recvcounts) {
1196       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1197     }
1198     for (i=0; i<size; i++) {
1199       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1200     }
1201     displs[0] = 0;
1202     for (i=1; i<size; i++) {
1203       displs[i] = displs[i-1] + recvcounts[i-1];
1204     }
1205     recvbuf = b->a;
1206 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1207     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1208 #else
1209     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1210 #endif
1211   }  /* endof (flag == MAT_GET_VALUES) */
1212   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1213 
1214   if (A->symmetric) {
1215     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1216   } else if (A->hermitian) {
1217     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1218   } else if (A->structurally_symmetric) {
1219     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1220   }
1221   PetscFunctionReturn(0);
1222 }
1223 
1224 PetscErrorCode MatDestroy_Dummy_MatGetSubmatrices(Mat C)
1225 {
1226   PetscErrorCode ierr;
1227   Mat_SubMat     *submatj = (Mat_SubMat*)C->spptr;
1228   PetscInt       i;
1229 
1230   PetscFunctionBegin;
1231   printf("MatDestroy_Dummy_MatGetSubmatrices...\n");
1232 
1233   if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
1234     ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1235 
1236     for (i=0; i<submatj->nrqr; ++i) {
1237       ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1238     }
1239     ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1240 
1241     if (submatj->rbuf1) {
1242       ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1243       ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1244     }
1245 
1246     for (i=0; i<submatj->nrqs; ++i) {
1247       ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1248     }
1249     ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1250     ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1251   }
1252 
1253 #if defined(PETSC_USE_CTABLE)
1254   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1255   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1256   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1257 #else
1258   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1259 #endif
1260 
1261   if (!submatj->allcolumns) {
1262 #if defined(PETSC_USE_CTABLE)
1263     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1264 #else
1265     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1266 #endif
1267   }
1268   ierr = submatj->destroy(C);CHKERRQ(ierr);
1269   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1270 
1271   ierr = PetscFree(submatj);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1276 {
1277   PetscErrorCode ierr;
1278   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1279   Mat_SubMat     *submatj = c->submatis1;
1280   PetscInt       i;
1281 
1282   PetscFunctionBegin;
1283   if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
1284     ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1285 
1286     for (i=0; i<submatj->nrqr; ++i) {
1287       ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1288     }
1289     ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1290 
1291     if (submatj->rbuf1) {
1292       ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1293       ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1294     }
1295 
1296     for (i=0; i<submatj->nrqs; ++i) {
1297       ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1298     }
1299     ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1300     ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1301   }
1302 
1303 #if defined(PETSC_USE_CTABLE)
1304   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1305   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1306   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1307 #else
1308   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1309 #endif
1310 
1311   if (!submatj->allcolumns) {
1312 #if defined(PETSC_USE_CTABLE)
1313     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1314 #else
1315     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1316 #endif
1317   }
1318   ierr = submatj->destroy(C);CHKERRQ(ierr);
1319   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1320 
1321   ierr = PetscFree(submatj);CHKERRQ(ierr);
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1326 {
1327   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1328   Mat            submat,A = c->A,B = c->B;
1329   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1330   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1331   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1332   const PetscInt *icol,*irow;
1333   PetscInt       nrow,ncol,start;
1334   PetscErrorCode ierr;
1335   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1336   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1337   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1338   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1339   PetscInt       *lens,rmax,ncols,*cols,Crow;
1340 #if defined(PETSC_USE_CTABLE)
1341   PetscTable     cmap,rmap;
1342   PetscInt       *cmap_loc,*rmap_loc;
1343 #else
1344   PetscInt       *cmap,*rmap;
1345 #endif
1346   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1347   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1348   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1349   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1350   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1351   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1352   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1353   MPI_Comm       comm;
1354   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1355   PetscMPIInt    *onodes1,*olengths1,idex,end;
1356   Mat_SubMat     *smatis1;
1357   PetscBool      isrowsorted;
1358 
1359   PetscFunctionBegin;
1360   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1361 
1362   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1363   size = c->size;
1364   rank = c->rank;
1365 
1366   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1367   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1368 
1369   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1370   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1371   if (allcolumns) {
1372     icol = NULL;
1373     ncol = C->cmap->N;
1374   } else {
1375     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1376     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1377   }
1378 
1379   if (scall == MAT_INITIAL_MATRIX) {
1380     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1381 
1382     /* Get some new tags to keep the communication clean */
1383     tag1 = ((PetscObject)C)->tag;
1384     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1385     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1386 
1387     /* evaluate communication - mesg to who, length of mesg, and buffer space
1388      required. Based on this, buffers are allocated, and data copied into them */
1389     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1390     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1391 
1392     /* w1[proc] = num of rows owned by proc -- to be requested */
1393     proc = 0;
1394     nrqs = 0; /* num of outgoing messages */
1395     for (j=0; j<nrow; j++) {
1396       row  = irow[j]; /* sorted! */
1397       while (row >= C->rmap->range[proc+1]) proc++;
1398       w1[proc]++;
1399       row2proc[j] = proc; /* map row index to proc */
1400 
1401       if (proc != rank && !w2[proc]) {
1402         w2[proc] = 1; nrqs++;
1403       }
1404     }
1405     w1[rank] = 0;  /* rows owned by self will not be requested */
1406 
1407     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1408     for (proc=0,j=0; proc<size; proc++) {
1409       if (w1[proc]) { pa[j++] = proc;}
1410     }
1411 
1412     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1413     msz = 0;              /* total mesg length (for all procs) */
1414     for (i=0; i<nrqs; i++) {
1415       proc      = pa[i];
1416       w1[proc] += 3;
1417       msz      += w1[proc];
1418     }
1419     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1420 
1421     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1422     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1423     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1424 
1425     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1426        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1427     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1428 
1429     /* Now post the Irecvs corresponding to these messages */
1430     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1431 
1432     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1433     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1434 
1435     /* Allocate Memory for outgoing messages */
1436     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1437     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1438     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1439 
1440     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1441     iptr = tmp;
1442     for (i=0; i<nrqs; i++) {
1443       proc        = pa[i];
1444       sbuf1[proc] = iptr;
1445       iptr       += w1[proc];
1446     }
1447 
1448     /* Form the outgoing messages */
1449     /* Initialize the header space */
1450     for (i=0; i<nrqs; i++) {
1451       proc      = pa[i];
1452       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1453       ptr[proc] = sbuf1[proc] + 3;
1454     }
1455 
1456     /* Parse the isrow and copy data into outbuf */
1457     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1458     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1459       proc = row2proc[j];
1460       if (proc != rank) { /* copy to the outgoing buf*/
1461         *ptr[proc] = irow[j];
1462         ctr[proc]++; ptr[proc]++;
1463       }
1464     }
1465 
1466     /* Update the headers for the current IS */
1467     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1468       if ((ctr_j = ctr[j])) {
1469         sbuf1_j        = sbuf1[j];
1470         k              = ++sbuf1_j[0];
1471         sbuf1_j[2*k]   = ctr_j;
1472         sbuf1_j[2*k-1] = 0;
1473       }
1474     }
1475 
1476     /* Now post the sends */
1477     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1478     for (i=0; i<nrqs; ++i) {
1479       proc = pa[i];
1480       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1481     }
1482 
1483     /* Post Receives to capture the buffer size */
1484     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1485     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1486 
1487     rbuf2[0] = tmp + msz;
1488     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1489 
1490     for (i=0; i<nrqs; ++i) {
1491       proc = pa[i];
1492       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1493     }
1494 
1495     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1496 
1497     /* Send to other procs the buf size they should allocate */
1498     /* Receive messages*/
1499     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1500     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1501 
1502     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1503     for (i=0; i<nrqr; ++i) {
1504       req_size[i] = 0;
1505       rbuf1_i        = rbuf1[i];
1506       start          = 2*rbuf1_i[0] + 1;
1507       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1508       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1509       sbuf2_i        = sbuf2[i];
1510       for (j=start; j<end; j++) {
1511         k            = rbuf1_i[j] - rstart;
1512         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1513         sbuf2_i[j]   = ncols;
1514         req_size[i] += ncols;
1515       }
1516       req_source1[i] = r_status1[i].MPI_SOURCE;
1517 
1518       /* form the header */
1519       sbuf2_i[0] = req_size[i];
1520       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1521 
1522       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1523     }
1524 
1525     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1526     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1527 
1528     /* rbuf2 is received, Post recv column indices a->j */
1529     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1530 
1531     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1532     for (i=0; i<nrqs; ++i) {
1533       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1534       req_source2[i] = r_status2[i].MPI_SOURCE;
1535       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1536     }
1537 
1538     /* Wait on sends1 and sends2 */
1539     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1540     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1541     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1542     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1543 
1544     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1545     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1546 
1547     /* Now allocate sending buffers for a->j, and send them off */
1548     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1549     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1550     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1551     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1552 
1553     for (i=0; i<nrqr; i++) { /* for each requested message */
1554       rbuf1_i   = rbuf1[i];
1555       sbuf_aj_i = sbuf_aj[i];
1556       ct1       = 2*rbuf1_i[0] + 1;
1557       ct2       = 0;
1558       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1559 
1560       kmax = rbuf1[i][2];
1561       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1562         row    = rbuf1_i[ct1] - rstart;
1563         nzA    = ai[row+1] - ai[row];
1564         nzB    = bi[row+1] - bi[row];
1565         ncols  = nzA + nzB;
1566         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1567 
1568         /* load the column indices for this row into cols*/
1569         cols = sbuf_aj_i + ct2;
1570 
1571         lwrite = 0;
1572         for (l=0; l<nzB; l++) {
1573           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1574         }
1575         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1576         for (l=0; l<nzB; l++) {
1577           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1578         }
1579 
1580         ct2 += ncols;
1581       }
1582       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1583     }
1584 
1585     /* create column map (cmap): global col of C -> local col of submat */
1586 #if defined(PETSC_USE_CTABLE)
1587     if (!allcolumns) {
1588       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1589       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1590       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1591         if (icol[j] >= cstart && icol[j] <cend) {
1592           cmap_loc[icol[j] - cstart] = j+1;
1593         } else { /* use PetscTable for non-local col indices */
1594           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1595         }
1596       }
1597     } else {
1598       cmap     = NULL;
1599       cmap_loc = NULL;
1600     }
1601     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1602 #else
1603     if (!allcolumns) {
1604       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1605       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1606     } else {
1607       cmap = NULL;
1608     }
1609 #endif
1610 
1611     /* Create lens for MatSeqAIJSetPreallocation() */
1612     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1613 
1614     /* Compute lens from local part of C */
1615     for (j=0; j<nrow; j++) {
1616       row  = irow[j];
1617       proc = row2proc[j];
1618       if (proc == rank) {
1619         /* diagonal part A = c->A */
1620         ncols = ai[row-rstart+1] - ai[row-rstart];
1621         cols  = aj + ai[row-rstart];
1622         if (!allcolumns) {
1623           for (k=0; k<ncols; k++) {
1624 #if defined(PETSC_USE_CTABLE)
1625             tcol = cmap_loc[cols[k]];
1626 #else
1627             tcol = cmap[cols[k]+cstart];
1628 #endif
1629             if (tcol) lens[j]++;
1630           }
1631         } else { /* allcolumns */
1632           lens[j] = ncols;
1633         }
1634 
1635         /* off-diagonal part B = c->B */
1636         ncols = bi[row-rstart+1] - bi[row-rstart];
1637         cols  = bj + bi[row-rstart];
1638         if (!allcolumns) {
1639           for (k=0; k<ncols; k++) {
1640 #if defined(PETSC_USE_CTABLE)
1641             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1642 #else
1643             tcol = cmap[bmap[cols[k]]];
1644 #endif
1645             if (tcol) lens[j]++;
1646           }
1647         } else { /* allcolumns */
1648           lens[j] += ncols;
1649         }
1650       }
1651     }
1652 
1653     /* Create row map (rmap): global row of C -> local row of submat */
1654 #if defined(PETSC_USE_CTABLE)
1655     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1656     for (j=0; j<nrow; j++) {
1657       row  = irow[j];
1658       proc = row2proc[j];
1659       if (proc == rank) { /* a local row */
1660         rmap_loc[row - rstart] = j;
1661       } else {
1662         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1663       }
1664     }
1665 #else
1666     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1667     for (j=0; j<nrow; j++) {
1668       rmap[irow[j]] = j;
1669     }
1670 #endif
1671 
1672     /* Update lens from offproc data */
1673     /* recv a->j is done */
1674     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1675     for (i=0; i<nrqs; i++) {
1676       proc    = pa[i];
1677       sbuf1_i = sbuf1[proc];
1678       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1679       ct1     = 2 + 1;
1680       ct2     = 0;
1681       rbuf2_i = rbuf2[i]; /* received length of C->j */
1682       rbuf3_i = rbuf3[i]; /* received C->j */
1683 
1684       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1685       max1   = sbuf1_i[2];
1686       for (k=0; k<max1; k++,ct1++) {
1687 #if defined(PETSC_USE_CTABLE)
1688         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1689         row--;
1690         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1691 #else
1692         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1693 #endif
1694         /* Now, store row index of submat in sbuf1_i[ct1] */
1695         sbuf1_i[ct1] = row;
1696 
1697         nnz = rbuf2_i[ct1];
1698         if (!allcolumns) {
1699           for (l=0; l<nnz; l++,ct2++) {
1700 #if defined(PETSC_USE_CTABLE)
1701             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1702               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1703             } else {
1704               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1705             }
1706 #else
1707             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1708 #endif
1709             if (tcol) lens[row]++;
1710           }
1711         } else { /* allcolumns */
1712           lens[row] += nnz;
1713         }
1714       }
1715     }
1716     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1717     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1718 
1719     /* Create the submatrices */
1720     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1721     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1722 
1723     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1724     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1725     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1726     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1727     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1728 
1729     /* create struct Mat_SubMat and attached it to submat */
1730     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1731     subc = (Mat_SeqAIJ*)submat->data;
1732     subc->submatis1 = smatis1;
1733 
1734     smatis1->id          = 0;
1735     smatis1->nrqs        = nrqs;
1736     smatis1->nrqr        = nrqr;
1737     smatis1->rbuf1       = rbuf1;
1738     smatis1->rbuf2       = rbuf2;
1739     smatis1->rbuf3       = rbuf3;
1740     smatis1->sbuf2       = sbuf2;
1741     smatis1->req_source2 = req_source2;
1742 
1743     smatis1->sbuf1       = sbuf1;
1744     smatis1->ptr         = ptr;
1745     smatis1->tmp         = tmp;
1746     smatis1->ctr         = ctr;
1747 
1748     smatis1->pa           = pa;
1749     smatis1->req_size     = req_size;
1750     smatis1->req_source1  = req_source1;
1751 
1752     smatis1->allcolumns  = allcolumns;
1753     smatis1->singleis    = PETSC_TRUE;
1754     smatis1->row2proc    = row2proc;
1755     smatis1->rmap        = rmap;
1756     smatis1->cmap        = cmap;
1757 #if defined(PETSC_USE_CTABLE)
1758     smatis1->rmap_loc    = rmap_loc;
1759     smatis1->cmap_loc    = cmap_loc;
1760 #endif
1761 
1762     smatis1->destroy     = submat->ops->destroy;
1763     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1764     submat->factortype   = C->factortype;
1765 
1766     /* compute rmax */
1767     rmax = 0;
1768     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1769 
1770   } else { /* scall == MAT_REUSE_MATRIX */
1771     submat = submats[0];
1772     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1773 
1774     subc    = (Mat_SeqAIJ*)submat->data;
1775     rmax    = subc->rmax;
1776     smatis1 = subc->submatis1;
1777     nrqs        = smatis1->nrqs;
1778     nrqr        = smatis1->nrqr;
1779     rbuf1       = smatis1->rbuf1;
1780     rbuf2       = smatis1->rbuf2;
1781     rbuf3       = smatis1->rbuf3;
1782     req_source2 = smatis1->req_source2;
1783 
1784     sbuf1     = smatis1->sbuf1;
1785     sbuf2     = smatis1->sbuf2;
1786     ptr       = smatis1->ptr;
1787     tmp       = smatis1->tmp;
1788     ctr       = smatis1->ctr;
1789 
1790     pa         = smatis1->pa;
1791     req_size   = smatis1->req_size;
1792     req_source1 = smatis1->req_source1;
1793 
1794     allcolumns = smatis1->allcolumns;
1795     row2proc   = smatis1->row2proc;
1796     rmap       = smatis1->rmap;
1797     cmap       = smatis1->cmap;
1798 #if defined(PETSC_USE_CTABLE)
1799     rmap_loc   = smatis1->rmap_loc;
1800     cmap_loc   = smatis1->cmap_loc;
1801 #endif
1802   }
1803 
1804   /* Post recv matrix values */
1805   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1806   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1807   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1808   for (i=0; i<nrqs; ++i) {
1809     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1810     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1811   }
1812 
1813   /* Allocate sending buffers for a->a, and send them off */
1814   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1815   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1816   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1817   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1818 
1819   for (i=0; i<nrqr; i++) {
1820     rbuf1_i   = rbuf1[i];
1821     sbuf_aa_i = sbuf_aa[i];
1822     ct1       = 2*rbuf1_i[0]+1;
1823     ct2       = 0;
1824     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1825 
1826     kmax = rbuf1_i[2];
1827     for (k=0; k<kmax; k++,ct1++) {
1828       row = rbuf1_i[ct1] - rstart;
1829       nzA = ai[row+1] - ai[row];
1830       nzB = bi[row+1] - bi[row];
1831       ncols  = nzA + nzB;
1832       cworkB = bj + bi[row];
1833       vworkA = a_a + ai[row];
1834       vworkB = b_a + bi[row];
1835 
1836       /* load the column values for this row into vals*/
1837       vals = sbuf_aa_i + ct2;
1838 
1839       lwrite = 0;
1840       for (l=0; l<nzB; l++) {
1841         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1842       }
1843       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1844       for (l=0; l<nzB; l++) {
1845         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1846       }
1847 
1848       ct2 += ncols;
1849     }
1850     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1851   }
1852 
1853   /* Assemble submat */
1854   /* First assemble the local rows */
1855   for (j=0; j<nrow; j++) {
1856     row  = irow[j];
1857     proc = row2proc[j];
1858     if (proc == rank) {
1859       Crow = row - rstart;  /* local row index of C */
1860 #if defined(PETSC_USE_CTABLE)
1861       row = rmap_loc[Crow]; /* row index of submat */
1862 #else
1863       row = rmap[row];
1864 #endif
1865 
1866       if (allcolumns) {
1867         /* diagonal part A = c->A */
1868         ncols = ai[Crow+1] - ai[Crow];
1869         cols  = aj + ai[Crow];
1870         vals  = a->a + ai[Crow];
1871         i     = 0;
1872         for (k=0; k<ncols; k++) {
1873           subcols[i]   = cols[k] + cstart;
1874           subvals[i++] = vals[k];
1875         }
1876 
1877         /* off-diagonal part B = c->B */
1878         ncols = bi[Crow+1] - bi[Crow];
1879         cols  = bj + bi[Crow];
1880         vals  = b->a + bi[Crow];
1881         for (k=0; k<ncols; k++) {
1882           subcols[i]   = bmap[cols[k]];
1883           subvals[i++] = vals[k];
1884         }
1885 
1886         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1887 
1888       } else { /* !allcolumns */
1889 #if defined(PETSC_USE_CTABLE)
1890         /* diagonal part A = c->A */
1891         ncols = ai[Crow+1] - ai[Crow];
1892         cols  = aj + ai[Crow];
1893         vals  = a->a + ai[Crow];
1894         i     = 0;
1895         for (k=0; k<ncols; k++) {
1896           tcol = cmap_loc[cols[k]];
1897           if (tcol) {
1898             subcols[i]   = --tcol;
1899             subvals[i++] = vals[k];
1900           }
1901         }
1902 
1903         /* off-diagonal part B = c->B */
1904         ncols = bi[Crow+1] - bi[Crow];
1905         cols  = bj + bi[Crow];
1906         vals  = b->a + bi[Crow];
1907         for (k=0; k<ncols; k++) {
1908           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1909           if (tcol) {
1910             subcols[i]   = --tcol;
1911             subvals[i++] = vals[k];
1912           }
1913         }
1914 #else
1915         /* diagonal part A = c->A */
1916         ncols = ai[Crow+1] - ai[Crow];
1917         cols  = aj + ai[Crow];
1918         vals  = a->a + ai[Crow];
1919         i     = 0;
1920         for (k=0; k<ncols; k++) {
1921           tcol = cmap[cols[k]+cstart];
1922           if (tcol) {
1923             subcols[i]   = --tcol;
1924             subvals[i++] = vals[k];
1925           }
1926         }
1927 
1928         /* off-diagonal part B = c->B */
1929         ncols = bi[Crow+1] - bi[Crow];
1930         cols  = bj + bi[Crow];
1931         vals  = b->a + bi[Crow];
1932         for (k=0; k<ncols; k++) {
1933           tcol = cmap[bmap[cols[k]]];
1934           if (tcol) {
1935             subcols[i]   = --tcol;
1936             subvals[i++] = vals[k];
1937           }
1938         }
1939 #endif
1940         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1941       }
1942     }
1943   }
1944 
1945   /* Now assemble the off-proc rows */
1946   for (i=0; i<nrqs; i++) { /* for each requested message */
1947     /* recv values from other processes */
1948     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1949     proc    = pa[idex];
1950     sbuf1_i = sbuf1[proc];
1951     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1952     ct1     = 2 + 1;
1953     ct2     = 0; /* count of received C->j */
1954     ct3     = 0; /* count of received C->j that will be inserted into submat */
1955     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1956     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1957     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1958 
1959     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1960     max1 = sbuf1_i[2];             /* num of rows */
1961     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1962       row = sbuf1_i[ct1]; /* row index of submat */
1963       if (!allcolumns) {
1964         idex = 0;
1965         if (scall == MAT_INITIAL_MATRIX) {
1966           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1967           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1968 #if defined(PETSC_USE_CTABLE)
1969             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1970               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1971             } else {
1972               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1973             }
1974 #else
1975             tcol = cmap[rbuf3_i[ct2]];
1976 #endif
1977             if (tcol) {
1978               subcols[idex]   = --tcol;
1979               subvals[idex++] = rbuf4_i[ct2];
1980 
1981               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1982                For reuse, we replace received C->j with index that should be inserted to submat */
1983               rbuf3_i[ct3++] = ct2;
1984             }
1985           }
1986           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1987 
1988         } else { /* scall == MAT_REUSE_MATRIX */
1989           submat = submats[0];
1990           subc   = (Mat_SeqAIJ*)submat->data;
1991 
1992           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1993           for (l=0; l<nnz; l++) {
1994             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1995             subvals[idex++] = rbuf4_i[ct2];
1996           }
1997 
1998           bj = subc->j + subc->i[row];
1999           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
2000         }
2001       } else { /* allcolumns */
2002         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
2003         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
2004         ct2 += nnz;
2005       }
2006     }
2007   }
2008 
2009   /* sending a->a are done */
2010   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
2011   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
2012 
2013   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2014   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2015   submats[0] = submat;
2016 
2017   /* Restore the indices */
2018   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
2019   if (!allcolumns) {
2020     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
2021   }
2022 
2023   /* Destroy allocated memory */
2024   for (i=0; i<nrqs; ++i) {
2025     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
2026   }
2027   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
2028   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2029   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2030 
2031   if (scall == MAT_INITIAL_MATRIX) {
2032     ierr = PetscFree(lens);CHKERRQ(ierr);
2033     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2034     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2035   }
2036   PetscFunctionReturn(0);
2037 }
2038 
2039 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2040 {
2041   PetscErrorCode ierr;
2042   PetscInt       ncol;
2043   PetscBool      colflag,allcolumns=PETSC_FALSE;
2044 
2045   PetscFunctionBegin;
2046   /* Allocate memory to hold all the submatrices */
2047   if (scall == MAT_INITIAL_MATRIX) {
2048     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2049   }
2050 
2051   /* Check for special case: each processor gets entire matrix columns */
2052   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2053   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2054   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2055 
2056   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2057   PetscFunctionReturn(0);
2058 }
2059 
2060 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2061 {
2062   PetscErrorCode ierr;
2063   PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
2064   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
2065   Mat_SeqAIJ     *subc;
2066   Mat_SubMat     *smat;
2067 
2068   PetscFunctionBegin;
2069   /* Check for special case: each processor has a single IS */
2070   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
2071     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2072     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
2073     PetscFunctionReturn(0);
2074   }
2075 
2076   /* Collect global wantallmatrix and nstages */
2077   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
2078   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2079   if (!nmax) nmax = 1;
2080 
2081   if (scall == MAT_INITIAL_MATRIX) {
2082     /* Collect global wantallmatrix and nstages */
2083     if (ismax == 1 && C->rmap->N == C->cmap->N) {
2084       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2085       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2086       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2087       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2088       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2089         wantallmatrix = PETSC_TRUE;
2090 
2091         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2092       }
2093     }
2094 
2095     /* Determine the number of stages through which submatrices are done
2096        Each stage will extract nmax submatrices.
2097        nmax is determined by the matrix column dimension.
2098        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2099     */
2100     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2101 
2102     in[0] = -1*(PetscInt)wantallmatrix;
2103     in[1] = nstages;
2104     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2105     wantallmatrix = (PetscBool)(-out[0]);
2106     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2107 
2108   } else { /* MAT_REUSE_MATRIX */
2109     if (ismax) {
2110       subc = (Mat_SeqAIJ*)((*submat)[0]->data);
2111       smat = subc->submatis1;
2112     } else { /* (*submat)[0] is a dummy matrix */
2113       smat = (Mat_SubMat*)((*submat)[0]->spptr);
2114     }
2115     if (!smat) {
2116       /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2117       wantallmatrix = PETSC_TRUE;
2118     } else if (smat->singleis) {
2119       ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2120       PetscFunctionReturn(0);
2121     } else {
2122       nstages = smat->nstages;
2123     }
2124   }
2125 
2126   if (wantallmatrix) {
2127     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2128     PetscFunctionReturn(0);
2129   }
2130 
2131   /* Allocate memory to hold all the submatrices and dummy submatrices */
2132   if (scall == MAT_INITIAL_MATRIX) {
2133     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2134   }
2135 
2136 #if 0
2137   PetscMPIInt rank;
2138   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr);
2139   printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax);
2140 #endif
2141   for (i=0,pos=0; i<nstages; i++) {
2142     if (pos+nmax <= ismax) max_no = nmax;
2143     else if (pos == ismax) max_no = 0;
2144     else                   max_no = ismax-pos;
2145     /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */
2146     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2147     pos += max_no;
2148   }
2149 
2150   if (scall == MAT_INITIAL_MATRIX) {
2151     /* save nstages for reuse */
2152     if (ismax) {
2153       subc = (Mat_SeqAIJ*)((*submat)[0]->data);
2154       smat = subc->submatis1;
2155     } else { /* dummy (*submat)[0] */
2156       smat = (Mat_SubMat*)((*submat)[0]->spptr);
2157     }
2158     if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"smat does not exit");
2159     smat->nstages = nstages;
2160   }
2161   PetscFunctionReturn(0);
2162 }
2163 
2164 /* -------------------------------------------------------------------------*/
2165 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2166 {
2167   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2168   Mat            A  = c->A;
2169   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2170   const PetscInt **icol,**irow;
2171   PetscInt       *nrow,*ncol,start;
2172   PetscErrorCode ierr;
2173   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2174   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2175   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2176   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2177   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2178 #if defined(PETSC_USE_CTABLE)
2179   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2180 #else
2181   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2182 #endif
2183   const PetscInt *irow_i;
2184   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2185   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2186   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2187   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2188   MPI_Status     *r_status3,*r_status4,*s_status4;
2189   MPI_Comm       comm;
2190   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2191   PetscMPIInt    *onodes1,*olengths1,end;
2192   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2193   Mat_SubMat     *smat_i;
2194   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2195   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2196 
2197   PetscFunctionBegin;
2198   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2199   size = c->size;
2200   rank = c->rank;
2201 
2202   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2203   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2204 
2205   for (i=0; i<ismax; i++) {
2206     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2207     if (!issorted[i]) iscsorted = issorted[i];
2208 
2209     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2210 
2211     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2212     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2213 
2214     /* Check for special case: allcolumn */
2215     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2216     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2217     if (colflag && ncol[i] == C->cmap->N) {
2218       allcolumns[i] = PETSC_TRUE;
2219       icol[i] = NULL;
2220     } else {
2221       allcolumns[i] = PETSC_FALSE;
2222       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2223     }
2224   }
2225 
2226   if (scall == MAT_REUSE_MATRIX) {
2227     /* Assumes new rows are same length as the old rows */
2228     for (i=0; i<ismax; i++) {
2229       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2230       subc = (Mat_SeqAIJ*)(submats[i]->data);
2231       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2232 
2233       /* Initial matrix as if empty */
2234       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2235 
2236       smat_i   = subc->submatis1;
2237 
2238       nrqs        = smat_i->nrqs;
2239       nrqr        = smat_i->nrqr;
2240       rbuf1       = smat_i->rbuf1;
2241       rbuf2       = smat_i->rbuf2;
2242       rbuf3       = smat_i->rbuf3;
2243       req_source2 = smat_i->req_source2;
2244 
2245       sbuf1     = smat_i->sbuf1;
2246       sbuf2     = smat_i->sbuf2;
2247       ptr       = smat_i->ptr;
2248       tmp       = smat_i->tmp;
2249       ctr       = smat_i->ctr;
2250 
2251       pa          = smat_i->pa;
2252       req_size    = smat_i->req_size;
2253       req_source1 = smat_i->req_source1;
2254 
2255       allcolumns[i] = smat_i->allcolumns;
2256       row2proc[i]   = smat_i->row2proc;
2257       rmap[i]       = smat_i->rmap;
2258       cmap[i]       = smat_i->cmap;
2259     }
2260 
2261     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2262       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2263       smat_i   = (Mat_SubMat*)(submats[0]->spptr);
2264 
2265       nrqs        = smat_i->nrqs;
2266       nrqr        = smat_i->nrqr;
2267       rbuf1       = smat_i->rbuf1;
2268       rbuf2       = smat_i->rbuf2;
2269       rbuf3       = smat_i->rbuf3;
2270       req_source2 = smat_i->req_source2;
2271 
2272       sbuf1       = smat_i->sbuf1;
2273       sbuf2       = smat_i->sbuf2;
2274       ptr         = smat_i->ptr;
2275       tmp         = smat_i->tmp;
2276       ctr         = smat_i->ctr;
2277 
2278       pa          = smat_i->pa;
2279       req_size    = smat_i->req_size;
2280       req_source1 = smat_i->req_source1;
2281 
2282       allcolumns[0] = PETSC_TRUE;
2283     }
2284   } else { /* scall == MAT_INITIAL_MATRIX */
2285     /* Get some new tags to keep the communication clean */
2286     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2287     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2288 
2289     /* evaluate communication - mesg to who, length of mesg, and buffer space
2290      required. Based on this, buffers are allocated, and data copied into them*/
2291     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2292 
2293     for (i=0; i<ismax; i++) {
2294       jmax   = nrow[i];
2295       irow_i = irow[i];
2296 
2297       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2298       row2proc[i] = row2proc_i;
2299 
2300       if (issorted[i]) proc = 0;
2301       for (j=0; j<jmax; j++) {
2302         if (!issorted[i]) proc = 0;
2303         row = irow_i[j];
2304         while (row >= C->rmap->range[proc+1]) proc++;
2305         w4[proc]++;
2306         row2proc_i[j] = proc; /* map row index to proc */
2307       }
2308       for (j=0; j<size; j++) {
2309         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2310       }
2311     }
2312 
2313     nrqs     = 0;              /* no of outgoing messages */
2314     msz      = 0;              /* total mesg length (for all procs) */
2315     w1[rank] = 0;              /* no mesg sent to self */
2316     w3[rank] = 0;
2317     for (i=0; i<size; i++) {
2318       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2319     }
2320     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2321     for (i=0,j=0; i<size; i++) {
2322       if (w1[i]) { pa[j] = i; j++; }
2323     }
2324 
2325     /* Each message would have a header = 1 + 2*(no of IS) + data */
2326     for (i=0; i<nrqs; i++) {
2327       j      = pa[i];
2328       w1[j] += w2[j] + 2* w3[j];
2329       msz   += w1[j];
2330     }
2331     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2332 
2333     /* Determine the number of messages to expect, their lengths, from from-ids */
2334     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2335     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2336 
2337     /* Now post the Irecvs corresponding to these messages */
2338     tag0 = ((PetscObject)C)->tag;
2339     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2340 
2341     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2342     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2343 
2344     /* Allocate Memory for outgoing messages */
2345     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2346     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2347     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2348 
2349     {
2350       PetscInt *iptr = tmp;
2351       k    = 0;
2352       for (i=0; i<nrqs; i++) {
2353         j        = pa[i];
2354         iptr    += k;
2355         sbuf1[j] = iptr;
2356         k        = w1[j];
2357       }
2358     }
2359 
2360     /* Form the outgoing messages. Initialize the header space */
2361     for (i=0; i<nrqs; i++) {
2362       j           = pa[i];
2363       sbuf1[j][0] = 0;
2364       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2365       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2366     }
2367 
2368     /* Parse the isrow and copy data into outbuf */
2369     for (i=0; i<ismax; i++) {
2370       row2proc_i = row2proc[i];
2371       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2372       irow_i = irow[i];
2373       jmax   = nrow[i];
2374       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2375         proc = row2proc_i[j];
2376         if (proc != rank) { /* copy to the outgoing buf*/
2377           ctr[proc]++;
2378           *ptr[proc] = irow_i[j];
2379           ptr[proc]++;
2380         }
2381       }
2382       /* Update the headers for the current IS */
2383       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2384         if ((ctr_j = ctr[j])) {
2385           sbuf1_j        = sbuf1[j];
2386           k              = ++sbuf1_j[0];
2387           sbuf1_j[2*k]   = ctr_j;
2388           sbuf1_j[2*k-1] = i;
2389         }
2390       }
2391     }
2392 
2393     /*  Now  post the sends */
2394     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2395     for (i=0; i<nrqs; ++i) {
2396       j    = pa[i];
2397       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2398     }
2399 
2400     /* Post Receives to capture the buffer size */
2401     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2402     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2403     rbuf2[0] = tmp + msz;
2404     for (i=1; i<nrqs; ++i) {
2405       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2406     }
2407     for (i=0; i<nrqs; ++i) {
2408       j    = pa[i];
2409       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2410     }
2411 
2412     /* Send to other procs the buf size they should allocate */
2413     /* Receive messages*/
2414     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2415     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2416     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2417     {
2418       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2419       PetscInt   *sbuf2_i;
2420 
2421       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2422       for (i=0; i<nrqr; ++i) {
2423         req_size[i] = 0;
2424         rbuf1_i        = rbuf1[i];
2425         start          = 2*rbuf1_i[0] + 1;
2426         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2427         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2428         sbuf2_i        = sbuf2[i];
2429         for (j=start; j<end; j++) {
2430           id              = rbuf1_i[j] - rstart;
2431           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2432           sbuf2_i[j]      = ncols;
2433           req_size[i] += ncols;
2434         }
2435         req_source1[i] = r_status1[i].MPI_SOURCE;
2436         /* form the header */
2437         sbuf2_i[0] = req_size[i];
2438         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2439 
2440         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2441       }
2442     }
2443     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2444     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2445     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2446 
2447     /* Receive messages*/
2448     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2449     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2450 
2451     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2452     for (i=0; i<nrqs; ++i) {
2453       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2454       req_source2[i] = r_status2[i].MPI_SOURCE;
2455       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2456     }
2457     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2458     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2459 
2460     /* Wait on sends1 and sends2 */
2461     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2462     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2463 
2464     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2465     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2466     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2467     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2468     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2469     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2470 
2471     /* Now allocate sending buffers for a->j, and send them off */
2472     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2473     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2474     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2475     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2476 
2477     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2478     {
2479       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2480       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2481       PetscInt cend = C->cmap->rend;
2482       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2483 
2484       for (i=0; i<nrqr; i++) {
2485         rbuf1_i   = rbuf1[i];
2486         sbuf_aj_i = sbuf_aj[i];
2487         ct1       = 2*rbuf1_i[0] + 1;
2488         ct2       = 0;
2489         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2490           kmax = rbuf1[i][2*j];
2491           for (k=0; k<kmax; k++,ct1++) {
2492             row    = rbuf1_i[ct1] - rstart;
2493             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2494             ncols  = nzA + nzB;
2495             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2496 
2497             /* load the column indices for this row into cols */
2498             cols = sbuf_aj_i + ct2;
2499 
2500             lwrite = 0;
2501             for (l=0; l<nzB; l++) {
2502               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2503             }
2504             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2505             for (l=0; l<nzB; l++) {
2506               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2507             }
2508 
2509             ct2 += ncols;
2510           }
2511         }
2512         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2513       }
2514     }
2515     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2516 
2517     /* create col map: global col of C -> local col of submatrices */
2518     {
2519       const PetscInt *icol_i;
2520 #if defined(PETSC_USE_CTABLE)
2521       for (i=0; i<ismax; i++) {
2522         if (!allcolumns[i]) {
2523           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2524 
2525           jmax   = ncol[i];
2526           icol_i = icol[i];
2527           cmap_i = cmap[i];
2528           for (j=0; j<jmax; j++) {
2529             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2530           }
2531         } else cmap[i] = NULL;
2532       }
2533 #else
2534       for (i=0; i<ismax; i++) {
2535         if (!allcolumns[i]) {
2536           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2537           jmax   = ncol[i];
2538           icol_i = icol[i];
2539           cmap_i = cmap[i];
2540           for (j=0; j<jmax; j++) {
2541             cmap_i[icol_i[j]] = j+1;
2542           }
2543         } else cmap[i] = NULL;
2544       }
2545 #endif
2546     }
2547 
2548     /* Create lens which is required for MatCreate... */
2549     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2550     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2551 
2552     if (ismax) {
2553       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2554     }
2555     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2556 
2557     /* Update lens from local data */
2558     for (i=0; i<ismax; i++) {
2559       row2proc_i = row2proc[i];
2560       jmax = nrow[i];
2561       if (!allcolumns[i]) cmap_i = cmap[i];
2562       irow_i = irow[i];
2563       lens_i = lens[i];
2564       for (j=0; j<jmax; j++) {
2565         row = irow_i[j];
2566         proc = row2proc_i[j];
2567         if (proc == rank) {
2568           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2569           if (!allcolumns[i]) {
2570             for (k=0; k<ncols; k++) {
2571 #if defined(PETSC_USE_CTABLE)
2572               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2573 #else
2574               tcol = cmap_i[cols[k]];
2575 #endif
2576               if (tcol) lens_i[j]++;
2577             }
2578           } else { /* allcolumns */
2579             lens_i[j] = ncols;
2580           }
2581           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2582         }
2583       }
2584     }
2585 
2586     /* Create row map: global row of C -> local row of submatrices */
2587 #if defined(PETSC_USE_CTABLE)
2588     for (i=0; i<ismax; i++) {
2589       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2590       irow_i = irow[i];
2591       jmax   = nrow[i];
2592       for (j=0; j<jmax; j++) {
2593       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2594       }
2595     }
2596 #else
2597     for (i=0; i<ismax; i++) {
2598       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2599       rmap_i = rmap[i];
2600       irow_i = irow[i];
2601       jmax   = nrow[i];
2602       for (j=0; j<jmax; j++) {
2603         rmap_i[irow_i[j]] = j;
2604       }
2605     }
2606 #endif
2607 
2608     /* Update lens from offproc data */
2609     {
2610       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2611 
2612       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2613       for (tmp2=0; tmp2<nrqs; tmp2++) {
2614         sbuf1_i = sbuf1[pa[tmp2]];
2615         jmax    = sbuf1_i[0];
2616         ct1     = 2*jmax+1;
2617         ct2     = 0;
2618         rbuf2_i = rbuf2[tmp2];
2619         rbuf3_i = rbuf3[tmp2];
2620         for (j=1; j<=jmax; j++) {
2621           is_no  = sbuf1_i[2*j-1];
2622           max1   = sbuf1_i[2*j];
2623           lens_i = lens[is_no];
2624           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2625           rmap_i = rmap[is_no];
2626           for (k=0; k<max1; k++,ct1++) {
2627 #if defined(PETSC_USE_CTABLE)
2628             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2629             row--;
2630             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2631 #else
2632             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2633 #endif
2634             max2 = rbuf2_i[ct1];
2635             for (l=0; l<max2; l++,ct2++) {
2636               if (!allcolumns[is_no]) {
2637 #if defined(PETSC_USE_CTABLE)
2638                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2639 #else
2640                 tcol = cmap_i[rbuf3_i[ct2]];
2641 #endif
2642                 if (tcol) lens_i[row]++;
2643               } else { /* allcolumns */
2644                 lens_i[row]++; /* lens_i[row] += max2 ? */
2645               }
2646             }
2647           }
2648         }
2649       }
2650     }
2651     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2652     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2653     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2654     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2655 
2656     /* Create the submatrices */
2657     for (i=0; i<ismax; i++) {
2658       PetscInt    rbs,cbs;
2659 
2660       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2661       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2662 
2663       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2664       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2665 
2666       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2667       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2668       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2669 
2670       /* create struct Mat_SubMat and attached it to submat */
2671       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2672       subc = (Mat_SeqAIJ*)submats[i]->data;
2673       subc->submatis1 = smat_i;
2674 
2675       smat_i->destroy          = submats[i]->ops->destroy;
2676       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2677       submats[i]->factortype   = C->factortype;
2678 
2679       smat_i->id          = i;
2680       smat_i->nrqs        = nrqs;
2681       smat_i->nrqr        = nrqr;
2682       smat_i->rbuf1       = rbuf1;
2683       smat_i->rbuf2       = rbuf2;
2684       smat_i->rbuf3       = rbuf3;
2685       smat_i->sbuf2       = sbuf2;
2686       smat_i->req_source2 = req_source2;
2687 
2688       smat_i->sbuf1       = sbuf1;
2689       smat_i->ptr         = ptr;
2690       smat_i->tmp         = tmp;
2691       smat_i->ctr         = ctr;
2692 
2693       smat_i->pa           = pa;
2694       smat_i->req_size     = req_size;
2695       smat_i->req_source1  = req_source1;
2696 
2697       smat_i->allcolumns  = allcolumns[i];
2698       smat_i->singleis    = PETSC_FALSE;
2699       smat_i->row2proc    = row2proc[i];
2700       smat_i->rmap        = rmap[i];
2701       smat_i->cmap        = cmap[i];
2702     }
2703 
2704     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2705       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2706       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2707       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2708 
2709       /* create struct Mat_SubMat and attached it to submat */
2710       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2711       submats[0]->spptr = (Mat_SubMat*)smat_i;
2712 
2713       smat_i->destroy          = submats[0]->ops->destroy;
2714       submats[0]->ops->destroy = MatDestroy_Dummy_MatGetSubmatrices;
2715       submats[0]->factortype   = C->factortype;
2716 
2717       smat_i->id          = i;
2718       smat_i->nrqs        = nrqs;
2719       smat_i->nrqr        = nrqr;
2720       smat_i->rbuf1       = rbuf1;
2721       smat_i->rbuf2       = rbuf2;
2722       smat_i->rbuf3       = rbuf3;
2723       smat_i->sbuf2       = sbuf2;
2724       smat_i->req_source2 = req_source2;
2725 
2726       smat_i->sbuf1       = sbuf1;
2727       smat_i->ptr         = ptr;
2728       smat_i->tmp         = tmp;
2729       smat_i->ctr         = ctr;
2730 
2731       smat_i->pa           = pa;
2732       smat_i->req_size     = req_size;
2733       smat_i->req_source1  = req_source1;
2734 
2735       smat_i->allcolumns  = PETSC_TRUE;
2736       smat_i->singleis    = PETSC_FALSE;
2737       smat_i->row2proc    = NULL;
2738       smat_i->rmap        = NULL;
2739       smat_i->cmap        = NULL;
2740     }
2741 
2742     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2743     ierr = PetscFree(lens);CHKERRQ(ierr);
2744     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2745     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2746 
2747   } /* endof scall == MAT_INITIAL_MATRIX */
2748 
2749   /* Post recv matrix values */
2750   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2751   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2752   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2753   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2754   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2755   for (i=0; i<nrqs; ++i) {
2756     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2757     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2758   }
2759 
2760   /* Allocate sending buffers for a->a, and send them off */
2761   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2762   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2763   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2764   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2765 
2766   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2767   {
2768     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2769     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2770     PetscInt    cend   = C->cmap->rend;
2771     PetscInt    *b_j   = b->j;
2772     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2773 
2774     for (i=0; i<nrqr; i++) {
2775       rbuf1_i   = rbuf1[i];
2776       sbuf_aa_i = sbuf_aa[i];
2777       ct1       = 2*rbuf1_i[0]+1;
2778       ct2       = 0;
2779       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2780         kmax = rbuf1_i[2*j];
2781         for (k=0; k<kmax; k++,ct1++) {
2782           row    = rbuf1_i[ct1] - rstart;
2783           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2784           ncols  = nzA + nzB;
2785           cworkB = b_j + b_i[row];
2786           vworkA = a_a + a_i[row];
2787           vworkB = b_a + b_i[row];
2788 
2789           /* load the column values for this row into vals*/
2790           vals = sbuf_aa_i+ct2;
2791 
2792           lwrite = 0;
2793           for (l=0; l<nzB; l++) {
2794             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2795           }
2796           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2797           for (l=0; l<nzB; l++) {
2798             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2799           }
2800 
2801           ct2 += ncols;
2802         }
2803       }
2804       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2805     }
2806   }
2807 
2808   /* Assemble the matrices */
2809   /* First assemble the local rows */
2810   for (i=0; i<ismax; i++) {
2811     row2proc_i = row2proc[i];
2812     subc      = (Mat_SeqAIJ*)submats[i]->data;
2813     imat_ilen = subc->ilen;
2814     imat_j    = subc->j;
2815     imat_i    = subc->i;
2816     imat_a    = subc->a;
2817 
2818     if (!allcolumns[i]) cmap_i = cmap[i];
2819     rmap_i = rmap[i];
2820     irow_i = irow[i];
2821     jmax   = nrow[i];
2822     for (j=0; j<jmax; j++) {
2823       row  = irow_i[j];
2824       proc = row2proc_i[j];
2825       if (proc == rank) {
2826         old_row = row;
2827 #if defined(PETSC_USE_CTABLE)
2828         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2829         row--;
2830 #else
2831         row = rmap_i[row];
2832 #endif
2833         ilen_row = imat_ilen[row];
2834         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2835         mat_i    = imat_i[row];
2836         mat_a    = imat_a + mat_i;
2837         mat_j    = imat_j + mat_i;
2838         if (!allcolumns[i]) {
2839           for (k=0; k<ncols; k++) {
2840 #if defined(PETSC_USE_CTABLE)
2841             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2842 #else
2843             tcol = cmap_i[cols[k]];
2844 #endif
2845             if (tcol) {
2846               *mat_j++ = tcol - 1;
2847               *mat_a++ = vals[k];
2848               ilen_row++;
2849             }
2850           }
2851         } else { /* allcolumns */
2852           for (k=0; k<ncols; k++) {
2853             *mat_j++ = cols[k];  /* global col index! */
2854             *mat_a++ = vals[k];
2855             ilen_row++;
2856           }
2857         }
2858         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2859 
2860         imat_ilen[row] = ilen_row;
2861       }
2862     }
2863   }
2864 
2865   /* Now assemble the off proc rows */
2866   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2867   for (tmp2=0; tmp2<nrqs; tmp2++) {
2868     sbuf1_i = sbuf1[pa[tmp2]];
2869     jmax    = sbuf1_i[0];
2870     ct1     = 2*jmax + 1;
2871     ct2     = 0;
2872     rbuf2_i = rbuf2[tmp2];
2873     rbuf3_i = rbuf3[tmp2];
2874     rbuf4_i = rbuf4[tmp2];
2875     for (j=1; j<=jmax; j++) {
2876       is_no     = sbuf1_i[2*j-1];
2877       rmap_i    = rmap[is_no];
2878       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2879       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2880       imat_ilen = subc->ilen;
2881       imat_j    = subc->j;
2882       imat_i    = subc->i;
2883       imat_a    = subc->a;
2884       max1      = sbuf1_i[2*j];
2885       for (k=0; k<max1; k++,ct1++) {
2886         row = sbuf1_i[ct1];
2887 #if defined(PETSC_USE_CTABLE)
2888         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2889         row--;
2890 #else
2891         row = rmap_i[row];
2892 #endif
2893         ilen  = imat_ilen[row];
2894         mat_i = imat_i[row];
2895         mat_a = imat_a + mat_i;
2896         mat_j = imat_j + mat_i;
2897         max2  = rbuf2_i[ct1];
2898         if (!allcolumns[is_no]) {
2899           for (l=0; l<max2; l++,ct2++) {
2900 #if defined(PETSC_USE_CTABLE)
2901             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2902 #else
2903             tcol = cmap_i[rbuf3_i[ct2]];
2904 #endif
2905             if (tcol) {
2906               *mat_j++ = tcol - 1;
2907               *mat_a++ = rbuf4_i[ct2];
2908               ilen++;
2909             }
2910           }
2911         } else { /* allcolumns */
2912           for (l=0; l<max2; l++,ct2++) {
2913             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2914             *mat_a++ = rbuf4_i[ct2];
2915             ilen++;
2916           }
2917         }
2918         imat_ilen[row] = ilen;
2919       }
2920     }
2921   }
2922 
2923   if (!iscsorted) { /* sort column indices of the rows */
2924     for (i=0; i<ismax; i++) {
2925       subc      = (Mat_SeqAIJ*)submats[i]->data;
2926       imat_j    = subc->j;
2927       imat_i    = subc->i;
2928       imat_a    = subc->a;
2929       imat_ilen = subc->ilen;
2930 
2931       if (allcolumns[i]) continue;
2932       jmax = nrow[i];
2933       for (j=0; j<jmax; j++) {
2934         mat_i = imat_i[j];
2935         mat_a = imat_a + mat_i;
2936         mat_j = imat_j + mat_i;
2937         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2938       }
2939     }
2940   }
2941 
2942   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2943   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2944   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2945   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2946   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2947 
2948   /* Restore the indices */
2949   for (i=0; i<ismax; i++) {
2950     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2951     if (!allcolumns[i]) {
2952       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2953     }
2954   }
2955 
2956   for (i=0; i<ismax; i++) {
2957     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2958     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2959   }
2960   if (!ismax) { /* a dummy matrix */
2961     //ierr = MatAssemblyBegin(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2962     //ierr = MatAssemblyEnd(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2963   }
2964 
2965   /* Destroy allocated memory */
2966   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2967   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2968   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2969 
2970   for (i=0; i<nrqs; ++i) {
2971     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2972   }
2973   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2974 
2975   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2976   PetscFunctionReturn(0);
2977 }
2978 
2979 /*
2980  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2981  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2982  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2983  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2984  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2985  state, and needs to be "assembled" later by compressing B's column space.
2986 
2987  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2988  Following this call, C->A & C->B have been created, even if empty.
2989  */
2990 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2991 {
2992   /* If making this function public, change the error returned in this function away from _PLIB. */
2993   PetscErrorCode ierr;
2994   Mat_MPIAIJ     *aij;
2995   Mat_SeqAIJ     *Baij;
2996   PetscBool      seqaij,Bdisassembled;
2997   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2998   PetscScalar    v;
2999   const PetscInt *rowindices,*colindices;
3000 
3001   PetscFunctionBegin;
3002   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
3003   if (A) {
3004     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
3005     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
3006     if (rowemb) {
3007       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
3008       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
3009     } else {
3010       if (C->rmap->n != A->rmap->n) {
3011 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
3012       }
3013     }
3014     if (dcolemb) {
3015       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
3016       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
3017     } else {
3018       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
3019     }
3020   }
3021   if (B) {
3022     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
3023     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
3024     if (rowemb) {
3025       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
3026       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
3027     } else {
3028       if (C->rmap->n != B->rmap->n) {
3029 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
3030       }
3031     }
3032     if (ocolemb) {
3033       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
3034       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
3035     } else {
3036       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
3037     }
3038   }
3039 
3040   aij    = (Mat_MPIAIJ*)(C->data);
3041   if (!aij->A) {
3042     /* Mimic parts of MatMPIAIJSetPreallocation() */
3043     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
3044     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
3045     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
3046     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
3047     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
3048   }
3049   if (A) {
3050     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
3051   } else {
3052     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
3053   }
3054   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
3055     /*
3056       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
3057       need to "disassemble" B -- convert it to using C's global indices.
3058       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
3059 
3060       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
3061       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
3062 
3063       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
3064       At least avoid calling MatSetValues() and the implied searches?
3065     */
3066 
3067     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
3068 #if defined(PETSC_USE_CTABLE)
3069       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
3070 #else
3071       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
3072       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
3073       if (aij->B) {
3074         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
3075       }
3076 #endif
3077       ngcol = 0;
3078       if (aij->lvec) {
3079 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
3080       }
3081       if (aij->garray) {
3082 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
3083 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
3084       }
3085       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3086       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3087     }
3088     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3089       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3090     }
3091     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3092       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3093     }
3094   }
3095   Bdisassembled = PETSC_FALSE;
3096   if (!aij->B) {
3097     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3098     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3099     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3100     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3101     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3102     Bdisassembled = PETSC_TRUE;
3103   }
3104   if (B) {
3105     Baij = (Mat_SeqAIJ*)(B->data);
3106     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3107       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3108       for (i=0; i<B->rmap->n; i++) {
3109 	nz[i] = Baij->i[i+1] - Baij->i[i];
3110       }
3111       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3112       ierr = PetscFree(nz);CHKERRQ(ierr);
3113     }
3114 
3115     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3116     shift = rend-rstart;
3117     count = 0;
3118     rowindices = NULL;
3119     colindices = NULL;
3120     if (rowemb) {
3121       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3122     }
3123     if (ocolemb) {
3124       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3125     }
3126     for (i=0; i<B->rmap->n; i++) {
3127       PetscInt row;
3128       row = i;
3129       if (rowindices) row = rowindices[i];
3130       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3131 	col  = Baij->j[count];
3132 	if (colindices) col = colindices[col];
3133 	if (Bdisassembled && col>=rstart) col += shift;
3134 	v    = Baij->a[count];
3135 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3136 	++count;
3137       }
3138     }
3139     /* No assembly for aij->B is necessary. */
3140     /* FIXME: set aij->B's nonzerostate correctly. */
3141   } else {
3142     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3143   }
3144   C->preallocated  = PETSC_TRUE;
3145   C->was_assembled = PETSC_FALSE;
3146   C->assembled     = PETSC_FALSE;
3147    /*
3148       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3149       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3150    */
3151   PetscFunctionReturn(0);
3152 }
3153 
3154 /*
3155   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3156  */
3157 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3158 {
3159   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3160 
3161   PetscFunctionBegin;
3162   PetscValidPointer(A,2);
3163   PetscValidPointer(B,3);
3164   /* FIXME: make sure C is assembled */
3165   *A = aij->A;
3166   *B = aij->B;
3167   /* Note that we don't incref *A and *B, so be careful! */
3168   PetscFunctionReturn(0);
3169 }
3170 
3171 /*
3172   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3173   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3174 */
3175 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3176                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3177 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3178 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3179 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3180 {
3181   PetscErrorCode ierr;
3182   PetscMPIInt    isize,flag;
3183   PetscInt       i,ii,cismax,ispar;
3184   Mat            *A,*B;
3185   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3186 
3187   PetscFunctionBegin;
3188   if (!ismax) PetscFunctionReturn(0);
3189 
3190   for (i = 0, cismax = 0; i < ismax; ++i) {
3191     PetscMPIInt isize;
3192     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3193     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3194     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3195     if (isize > 1) ++cismax;
3196   }
3197 
3198   /*
3199      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3200      ispar counts the number of parallel ISs across C's comm.
3201   */
3202   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3203   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3204     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3205     PetscFunctionReturn(0);
3206   }
3207 
3208   /* if (ispar) */
3209   /*
3210     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3211     These are used to extract the off-diag portion of the resulting parallel matrix.
3212     The row IS for the off-diag portion is the same as for the diag portion,
3213     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3214   */
3215   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3216   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3217   for (i = 0, ii = 0; i < ismax; ++i) {
3218     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3219     if (isize > 1) {
3220       /*
3221 	 TODO: This is the part that's ***NOT SCALABLE***.
3222 	 To fix this we need to extract just the indices of C's nonzero columns
3223 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3224 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3225 	 to be done without serializing on the IS list, so, most likely, it is best
3226 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3227       */
3228       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3229       /* Now we have to
3230 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3231 	     were sorted on each rank, concatenated they might no longer be sorted;
3232 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3233 	     indices in the nondecreasing order to the original index positions.
3234 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3235       */
3236       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3237       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3238       ++ii;
3239     }
3240   }
3241   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3242   for (i = 0, ii = 0; i < ismax; ++i) {
3243     PetscInt       j,issize;
3244     const PetscInt *indices;
3245 
3246     /*
3247        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3248      */
3249     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3250     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3251     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3252     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3253     for (j = 1; j < issize; ++j) {
3254       if (indices[j] == indices[j-1]) {
3255 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3256       }
3257     }
3258     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3259 
3260 
3261     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3262     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3263     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3264     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3265     for (j = 1; j < issize; ++j) {
3266       if (indices[j-1] == indices[j]) {
3267 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3268       }
3269     }
3270     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3271     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3272     if (isize > 1) {
3273       cisrow[ii] = isrow[i];
3274       ++ii;
3275     }
3276   }
3277   /*
3278     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3279     array of sequential matrices underlying the resulting parallel matrices.
3280     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3281     contain duplicates.
3282 
3283     There are as many diag matrices as there are original index sets. There are only as many parallel
3284     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3285 
3286     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3287     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3288       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3289       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3290       with A[i] and B[ii] extracted from the corresponding MPI submat.
3291     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3292       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3293       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3294       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3295       values into A[i] and B[ii] sitting inside the corresponding submat.
3296     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3297       A[i], B[ii], AA[i] or BB[ii] matrices.
3298   */
3299   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3300   if (scall == MAT_INITIAL_MATRIX) {
3301     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3302   }
3303 
3304   /* Now obtain the sequential A and B submatrices separately. */
3305   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3306   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3307   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3308 
3309   /*
3310     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3311     matrices A & B have been extracted directly into the parallel matrices containing them, or
3312     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3313     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3314     to have the same sparsity pattern.
3315     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3316     must be constructed for C. This is done by setseqmat(s).
3317   */
3318   for (i = 0, ii = 0; i < ismax; ++i) {
3319     /*
3320        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3321        That way we can avoid sorting and computing permutations when reusing.
3322        To this end:
3323         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3324 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3325 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3326     */
3327     MatStructure pattern;
3328     pattern = DIFFERENT_NONZERO_PATTERN;
3329 
3330     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3331     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3332     if (isize > 1) {
3333       if (scall == MAT_INITIAL_MATRIX) {
3334 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3335 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3336 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3337 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3338 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3339       }
3340       /*
3341 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3342       */
3343       {
3344 	Mat AA,BB;
3345         AA = A[i];
3346         BB = B[ii];
3347 	if (AA || BB) {
3348 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3349 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3350 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3351 	}
3352 
3353         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3354       }
3355       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3356       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3357       ++ii;
3358     } else { /* if (isize == 1) */
3359       if (scall == MAT_REUSE_MATRIX) {
3360         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3361       }
3362       if (isrow_p[i] || iscol_p[i]) {
3363         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3364         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3365 	/* Otherwise A is extracted straight into (*submats)[i]. */
3366 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3367 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3368       } else (*submat)[i] = A[i];
3369     }
3370     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3371     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3372   }
3373   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3374   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3375   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3376   ierr = PetscFree(A);CHKERRQ(ierr);
3377   ierr = MatDestroyMatrices(cismax,&B);CHKERRQ(ierr);
3378   PetscFunctionReturn(0);
3379 }
3380 
3381 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3382 {
3383   PetscErrorCode ierr;
3384 
3385   PetscFunctionBegin;
3386   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3387   PetscFunctionReturn(0);
3388 }
3389