xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 5202f6e616d0d15c2ef6453b4e51aa747f3cddb6)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50 {
51   PetscErrorCode   ierr;
52   MPI_Comm         comm;
53   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
56   const PetscInt  *indices_i,**indices;
57   PetscLayout      rmap;
58   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
59   PetscSF          sf;
60   PetscSFNode     *remote;
61 
62   PetscFunctionBegin;
63   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
64   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
65   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
66   /* get row map to determine where rows should be going */
67   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
68   /* retrieve IS data and put all together so that we
69    * can optimize communication
70    *  */
71   ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
72   for (i=0,tlength=0; i<nidx; i++){
73     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
74     tlength += length[i];
75     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
76   }
77   /* find these rows on remote processors */
78   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
79   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
80   nrrows = 0;
81   for (i=0; i<nidx; i++){
82     length_i     = length[i];
83     indices_i    = indices[i];
84     for (j=0; j<length_i; j++){
85       owner = -1;
86       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
87       /* remote processors */
88       if (owner != rank){
89         tosizes_temp[owner]++; /* number of rows to owner */
90         rrow_ranks[nrrows]  = owner; /* processor */
91         rrow_isids[nrrows]   = i; /* is id */
92         remoterows[nrrows++] = indices_i[j]; /* row */
93       }
94     }
95     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
96   }
97   ierr = PetscFree2(indices,length);CHKERRQ(ierr);
98   /* test if we need to exchange messages
99    * generally speaking, we do not need to exchange
100    * data when overlap is 1
101    * */
102   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
103   /* we do not have any messages
104    * It usually corresponds to overlap 1
105    * */
106   if (!reducednrrows){
107     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
108     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
109     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112   nto = 0;
113   /* send sizes and ranks for building a two-sided communcation */
114   for (i=0; i<size; i++){
115    if (tosizes_temp[i]){
116      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117      tosizes_temp[i] = nto; /* a map from processor to index */
118      toranks[nto++]  = i; /* processor */
119    }
120   }
121   ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
122   for (i=0; i<nto; i++){
123     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125   }
126   /* send information to other processors */
127   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
128   nrecvrows = 0;
129   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
131   nrecvrows = 0;
132   for (i=0; i<nfrom; i++){
133     for (j=0; j<fromsizes[2*i]; j++){
134       remote[nrecvrows].rank    = fromranks[i];
135       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136     }
137   }
138   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
139   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
140   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
141   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
142   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
143   /* message pair <no of is, row>  */
144   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
145   for (i=0; i<nrrows; i++){
146     owner = rrow_ranks[i]; /* processor */
147     j     = tosizes_temp[owner]; /* index */
148     todata[toffsets[j]++] = rrow_isids[i];
149     todata[toffsets[j]++] = remoterows[i];
150   }
151   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
152   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
153   ierr = PetscFree(toffsets);CHKERRQ(ierr);
154   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
155   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
156   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
157   /* send rows belonging to the remote so that then we could get the overlapping data back */
158   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
159   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
160   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
161   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
162   ierr = PetscFree(fromranks);CHKERRQ(ierr);
163   nrecvrows = 0;
164   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
166   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
167   nrecvrows = 0;
168   for (i=0; i<nto; i++){
169     for (j=0; j<tosizes[2*i]; j++){
170       remote[nrecvrows].rank    = toranks[i];
171       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172     }
173   }
174   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
175   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
176   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
177   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
178   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
179   /* overlap communication and computation */
180   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
181   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
182   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
183   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
184   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
185   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
186   ierr = PetscFree(toranks);CHKERRQ(ierr);
187   ierr = PetscFree(tosizes);CHKERRQ(ierr);
188   ierr = PetscFree(todata);CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193 {
194   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196   const PetscInt   *indices_i_temp;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++){
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++){
209     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
210     ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
211     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
212     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
213   }
214   /* retrieve information to get row id and its overlap */
215   for (i=0; i<nrecvs; ){
216     is_id      = recvdata[i++];
217     data_size  = recvdata[i++];
218     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
219     isz_i      = isz[is_id];
220     for (j=0; j< data_size; j++){
221       col = recvdata[i++];
222       indices_i[isz_i++] = col;
223     }
224     isz[is_id] = isz_i;
225   }
226   /* remove duplicate entities */
227   for (i=0; i<nidx; i++){
228     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
229     isz_i      = isz[i];
230     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
231     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
232   }
233   ierr = PetscFree(isz);CHKERRQ(ierr);
234   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
239 {
240   PetscLayout       rmap,cmap;
241   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
242   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
243   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
244   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
245   Mat               amat,bmat;
246   PetscMPIInt       rank;
247   PetscBool         done;
248   MPI_Comm          comm;
249   PetscErrorCode    ierr;
250 
251   PetscFunctionBegin;
252   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
253   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
254   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
255   /* Even if the mat is symmetric, we still assume it is not symmetric */
256   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
257   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
258   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
260   /* total number of nonzero values is used to estimate the memory usage in the next step */
261   tnz  = ai[an]+bi[bn];
262   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
263   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
264   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
265   /* to find the longest message */
266   max_fszs = 0;
267   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
268   /* better way to estimate number of nonzero in the mat??? */
269   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
270   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
271   rows_pos  = 0;
272   totalrows = 0;
273   for (i=0; i<nfrom; i++){
274     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
275     /* group data together */
276     for (j=0; j<fromsizes[2*i]; j+=2){
277       is_id                       = fromrows[rows_pos++];/* no of is */
278       rows_i                      = rows_data[is_id];
279       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
280     }
281     /* estimate a space to avoid multiple allocations  */
282     for (j=0; j<nidx; j++){
283       indvc_ij = 0;
284       rows_i   = rows_data[j];
285       for (l=0; l<rows_pos_i[j]; l++){
286         row    = rows_i[l]-rstart;
287         start  = ai[row];
288         end    = ai[row+1];
289         for (k=start; k<end; k++){ /* Amat */
290           col = aj[k] + cstart;
291           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
292         }
293         start = bi[row];
294         end   = bi[row+1];
295         for (k=start; k<end; k++) { /* Bmat */
296           col = gcols[bj[k]];
297           indices_tmp[indvc_ij++] = col;
298         }
299       }
300       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
301       indv_counts[i*nidx+j] = indvc_ij;
302       totalrows            += indvc_ij;
303     }
304   }
305   /* message triple <no of is, number of rows, rows> */
306   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
307   totalrows = 0;
308   rows_pos  = 0;
309   /* use this code again */
310   for (i=0;i<nfrom;i++){
311     ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
312     for (j=0; j<fromsizes[2*i]; j+=2){
313       is_id                       = fromrows[rows_pos++];
314       rows_i                      = rows_data[is_id];
315       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
316     }
317     /* add data  */
318     for (j=0; j<nidx; j++){
319       if (!indv_counts[i*nidx+j]) continue;
320       indvc_ij = 0;
321       sbdata[totalrows++] = j;
322       sbdata[totalrows++] = indv_counts[i*nidx+j];
323       sbsizes[2*i]       += 2;
324       rows_i              = rows_data[j];
325       for (l=0; l<rows_pos_i[j]; l++){
326         row   = rows_i[l]-rstart;
327         start = ai[row];
328         end   = ai[row+1];
329         for (k=start; k<end; k++){ /* Amat */
330           col = aj[k] + cstart;
331           indices_tmp[indvc_ij++] = col;
332         }
333         start = bi[row];
334         end   = bi[row+1];
335         for (k=start; k<end; k++) { /* Bmat */
336           col = gcols[bj[k]];
337           indices_tmp[indvc_ij++] = col;
338         }
339       }
340       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
341       sbsizes[2*i]  += indvc_ij;
342       ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
343       totalrows += indvc_ij;
344     }
345   }
346   ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
347   for (i=0; i<nfrom; i++){
348     offsets[i+1]   = offsets[i] + sbsizes[2*i];
349     sbsizes[2*i+1] = offsets[i];
350   }
351   ierr = PetscFree(offsets);CHKERRQ(ierr);
352   if (sbrowsizes) *sbrowsizes = sbsizes;
353   if (sbrows) *sbrows = sbdata;
354   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
355   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
356   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
357   PetscFunctionReturn(0);
358 }
359 
360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
361 {
362   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
363   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
364   PetscInt          lsize,lsize_tmp,owner;
365   PetscMPIInt       rank;
366   Mat               amat,bmat;
367   PetscBool         done;
368   PetscLayout       cmap,rmap;
369   MPI_Comm          comm;
370   PetscErrorCode    ierr;
371 
372   PetscFunctionBegin;
373   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
374   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
375   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
376   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
377   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
378   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
379   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
380   /* is it a safe way to compute number of nonzero values ? */
381   tnz  = ai[an]+bi[bn];
382   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
383   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
384   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
385   /* it is a better way to estimate memory than the old implementation
386    * where global size of matrix is used
387    * */
388   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
389   for (i=0; i<nidx; i++) {
390     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
391     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
392     lsize_tmp = 0;
393     for (j=0; j<lsize; j++) {
394       owner = -1;
395       row   = indices[j];
396       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
397       if (owner != rank) continue;
398       /* local number */
399       row  -= rstart;
400       start = ai[row];
401       end   = ai[row+1];
402       for (k=start; k<end; k++) { /* Amat */
403         col = aj[k] + cstart;
404         indices_temp[lsize_tmp++] = col;
405       }
406       start = bi[row];
407       end   = bi[row+1];
408       for (k=start; k<end; k++) { /* Bmat */
409         col = gcols[bj[k]];
410         indices_temp[lsize_tmp++] = col;
411       }
412     }
413    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
414    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
415    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
416    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
417   }
418   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
419   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
420   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
421   PetscFunctionReturn(0);
422 }
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
438   mesg [n]  datas[1]
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
445   nrqs - no of requests sent (or to be sent out)
446   nrqr - no of requests recieved (which have to be or which have been processed
447 */
448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
449 {
450   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
451   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
452   const PetscInt **idx,*idx_i;
453   PetscInt       *n,**data,len;
454 #if defined(PETSC_USE_CTABLE)
455   PetscTable     *table_data,table_data_i;
456   PetscInt       *tdata,tcount,tcount_max;
457 #else
458   PetscInt       *data_i,*d_p;
459 #endif
460   PetscErrorCode ierr;
461   PetscMPIInt    size,rank,tag1,tag2;
462   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
463   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
464   PetscBT        *table;
465   MPI_Comm       comm;
466   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
467   MPI_Status     *s_status,*recv_status;
468   char           *t_p;
469 
470   PetscFunctionBegin;
471   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
472   size = c->size;
473   rank = c->rank;
474   M    = C->rmap->N;
475 
476   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
477   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
478 
479   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
480 
481   for (i=0; i<imax; i++) {
482     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
483     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
484   }
485 
486   /* evaluate communication - mesg to who,length of mesg, and buffer space
487      required. Based on this, buffers are allocated, and data copied into them  */
488   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
489   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
490   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
491   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
492   for (i=0; i<imax; i++) {
493     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
494     idx_i = idx[i];
495     len   = n[i];
496     for (j=0; j<len; j++) {
497       row = idx_i[j];
498       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
499       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
500       w4[proc]++;
501     }
502     for (j=0; j<size; j++) {
503       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
504     }
505   }
506 
507   nrqs     = 0;              /* no of outgoing messages */
508   msz      = 0;              /* total mesg length (for all proc */
509   w1[rank] = 0;              /* no mesg sent to intself */
510   w3[rank] = 0;
511   for (i=0; i<size; i++) {
512     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
513   }
514   /* pa - is list of processors to communicate with */
515   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
516   for (i=0,j=0; i<size; i++) {
517     if (w1[i]) {pa[j] = i; j++;}
518   }
519 
520   /* Each message would have a header = 1 + 2*(no of IS) + data */
521   for (i=0; i<nrqs; i++) {
522     j      = pa[i];
523     w1[j] += w2[j] + 2*w3[j];
524     msz   += w1[j];
525   }
526 
527   /* Determine the number of messages to expect, their lengths, from from-ids */
528   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
529   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
530 
531   /* Now post the Irecvs corresponding to these messages */
532   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
533 
534   /* Allocate Memory for outgoing messages */
535   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
536   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
537   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
538 
539   {
540     PetscInt *iptr = tmp,ict  = 0;
541     for (i=0; i<nrqs; i++) {
542       j         = pa[i];
543       iptr     +=  ict;
544       outdat[j] = iptr;
545       ict       = w1[j];
546     }
547   }
548 
549   /* Form the outgoing messages */
550   /* plug in the headers */
551   for (i=0; i<nrqs; i++) {
552     j            = pa[i];
553     outdat[j][0] = 0;
554     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
555     ptr[j]       = outdat[j] + 2*w3[j] + 1;
556   }
557 
558   /* Memory for doing local proc's work */
559   {
560     PetscInt M_BPB_imax = 0;
561 #if defined(PETSC_USE_CTABLE)
562     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
563     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
564     for (i=0; i<imax; i++) {
565       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
566     }
567     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
570     }
571 #else
572     PetscInt Mimax = 0;
573     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
574     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
575     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
576     for (i=0; i<imax; i++) {
577       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578       data[i]  = d_p + M*i;
579     }
580 #endif
581   }
582 
583   /* Parse the IS and update local tables and the outgoing buf with the data */
584   {
585     PetscInt n_i,isz_i,*outdat_j,ctr_j;
586     PetscBT  table_i;
587 
588     for (i=0; i<imax; i++) {
589       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
590       n_i     = n[i];
591       table_i = table[i];
592       idx_i   = idx[i];
593 #if defined(PETSC_USE_CTABLE)
594       table_data_i = table_data[i];
595 #else
596       data_i  = data[i];
597 #endif
598       isz_i   = isz[i];
599       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
600         row  = idx_i[j];
601         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
602         if (proc != rank) { /* copy to the outgoing buffer */
603           ctr[proc]++;
604           *ptr[proc] = row;
605           ptr[proc]++;
606         } else if (!PetscBTLookupSet(table_i,row)) {
607 #if defined(PETSC_USE_CTABLE)
608           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
609 #else
610           data_i[isz_i] = row; /* Update the local table */
611 #endif
612           isz_i++;
613         }
614       }
615       /* Update the headers for the current IS */
616       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
617         if ((ctr_j = ctr[j])) {
618           outdat_j        = outdat[j];
619           k               = ++outdat_j[0];
620           outdat_j[2*k]   = ctr_j;
621           outdat_j[2*k-1] = i;
622         }
623       }
624       isz[i] = isz_i;
625     }
626   }
627 
628   /*  Now  post the sends */
629   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
630   for (i=0; i<nrqs; ++i) {
631     j    = pa[i];
632     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
633   }
634 
635   /* No longer need the original indices */
636   for (i=0; i<imax; ++i) {
637     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
638   }
639   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
640 
641   for (i=0; i<imax; ++i) {
642     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
643   }
644 
645   /* Do Local work */
646 #if defined(PETSC_USE_CTABLE)
647   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
648 #else
649   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
650 #endif
651 
652   /* Receive messages */
653   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
654   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
655 
656   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
657   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
658 
659   /* Phase 1 sends are complete - deallocate buffers */
660   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
661   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
662 
663   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
664   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
665   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
666   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
667   ierr = PetscFree(rbuf);CHKERRQ(ierr);
668 
669 
670   /* Send the data back */
671   /* Do a global reduction to know the buffer space req for incoming messages */
672   {
673     PetscMPIInt *rw1;
674 
675     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
676 
677     for (i=0; i<nrqr; ++i) {
678       proc = recv_status[i].MPI_SOURCE;
679 
680       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
681       rw1[proc] = isz1[i];
682     }
683     ierr = PetscFree(onodes1);CHKERRQ(ierr);
684     ierr = PetscFree(olengths1);CHKERRQ(ierr);
685 
686     /* Determine the number of messages to expect, their lengths, from from-ids */
687     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
688     ierr = PetscFree(rw1);CHKERRQ(ierr);
689   }
690   /* Now post the Irecvs corresponding to these messages */
691   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
692 
693   /* Now  post the sends */
694   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
695   for (i=0; i<nrqr; ++i) {
696     j    = recv_status[i].MPI_SOURCE;
697     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
698   }
699 
700   /* receive work done on other processors */
701   {
702     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
703     PetscMPIInt idex;
704     PetscBT     table_i;
705     MPI_Status  *status2;
706 
707     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
708     for (i=0; i<nrqs; ++i) {
709       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
710       /* Process the message */
711       rbuf2_i = rbuf2[idex];
712       ct1     = 2*rbuf2_i[0]+1;
713       jmax    = rbuf2[idex][0];
714       for (j=1; j<=jmax; j++) {
715         max     = rbuf2_i[2*j];
716         is_no   = rbuf2_i[2*j-1];
717         isz_i   = isz[is_no];
718         table_i = table[is_no];
719 #if defined(PETSC_USE_CTABLE)
720         table_data_i = table_data[is_no];
721 #else
722         data_i  = data[is_no];
723 #endif
724         for (k=0; k<max; k++,ct1++) {
725           row = rbuf2_i[ct1];
726           if (!PetscBTLookupSet(table_i,row)) {
727 #if defined(PETSC_USE_CTABLE)
728             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
729 #else
730             data_i[isz_i] = row;
731 #endif
732             isz_i++;
733           }
734         }
735         isz[is_no] = isz_i;
736       }
737     }
738 
739     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
740     ierr = PetscFree(status2);CHKERRQ(ierr);
741   }
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767   }
768 
769   ierr = PetscFree(onodes2);CHKERRQ(ierr);
770   ierr = PetscFree(olengths2);CHKERRQ(ierr);
771 
772   ierr = PetscFree(pa);CHKERRQ(ierr);
773   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
774   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
775   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
776   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
779   ierr = PetscFree(s_status);CHKERRQ(ierr);
780   ierr = PetscFree(recv_status);CHKERRQ(ierr);
781   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
782   ierr = PetscFree(xdata);CHKERRQ(ierr);
783   ierr = PetscFree(isz1);CHKERRQ(ierr);
784 #if defined(PETSC_USE_CTABLE)
785   for (i=0; i<imax; i++) {
786     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
787   }
788   ierr = PetscFree(table_data);CHKERRQ(ierr);
789   ierr = PetscFree(tdata);CHKERRQ(ierr);
790   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
791 #else
792   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
793 #endif
794   PetscFunctionReturn(0);
795 }
796 
797 /*
798    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
799        the work on the local processor.
800 
801      Inputs:
802       C      - MAT_MPIAIJ;
803       imax - total no of index sets processed at a time;
804       table  - an array of char - size = m bits.
805 
806      Output:
807       isz    - array containing the count of the solution elements corresponding
808                to each index set;
809       data or table_data  - pointer to the solutions
810 */
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
900 /*
901       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
902          and return the output
903 
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
907            rbuf - an array of pointers to the recieved requests
908 
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912 
913   For better efficiency perhaps we should malloc separately each xdata[i],
914 then if a remalloc is required we need only copy the data for that one row
915 rather then all previous rows as it is now where a single large chunck of
916 memory is used.
917 
918 */
919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920 {
921   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922   Mat            A  = c->A,B = c->B;
923   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
924   PetscErrorCode ierr;
925   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928   PetscInt       *rbuf_i,kmax,rbuf_0;
929   PetscBT        xtable;
930 
931   PetscFunctionBegin;
932   m      = C->rmap->N;
933   rstart = C->rmap->rstart;
934   cstart = C->cmap->rstart;
935   ai     = a->i;
936   aj     = a->j;
937   bi     = b->i;
938   bj     = b->j;
939   garray = c->garray;
940 
941 
942   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943     rbuf_i =  rbuf[i];
944     rbuf_0 =  rbuf_i[0];
945     ct    += rbuf_0;
946     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947   }
948 
949   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950   else max1 = 1;
951   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
953   ++no_malloc;
954   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
955   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
956 
957   ct3 = 0;
958   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
959     rbuf_i =  rbuf[i];
960     rbuf_0 =  rbuf_i[0];
961     ct1    =  2*rbuf_0+1;
962     ct2    =  ct1;
963     ct3   += ct1;
964     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
966       oct2 = ct2;
967       kmax = rbuf_i[2*j];
968       for (k=0; k<kmax; k++,ct1++) {
969         row = rbuf_i[ct1];
970         if (!PetscBTLookupSet(xtable,row)) {
971           if (!(ct3 < mem_estimate)) {
972             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
974             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
975             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
976             xdata[0]     = tmp;
977             mem_estimate = new_estimate; ++no_malloc;
978             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979           }
980           xdata[i][ct2++] = row;
981           ct3++;
982         }
983       }
984       for (k=oct2,max2=ct2; k<max2; k++) {
985         row   = xdata[i][k] - rstart;
986         start = ai[row];
987         end   = ai[row+1];
988         for (l=start; l<end; l++) {
989           val = aj[l] + cstart;
990           if (!PetscBTLookupSet(xtable,val)) {
991             if (!(ct3 < mem_estimate)) {
992               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
994               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
995               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
996               xdata[0]     = tmp;
997               mem_estimate = new_estimate; ++no_malloc;
998               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999             }
1000             xdata[i][ct2++] = val;
1001             ct3++;
1002           }
1003         }
1004         start = bi[row];
1005         end   = bi[row+1];
1006         for (l=start; l<end; l++) {
1007           val = garray[bj[l]];
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022       }
1023       /* Update the header*/
1024       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026     }
1027     xdata[i][0] = rbuf_0;
1028     xdata[i+1]  = xdata[i] + ct2;
1029     isz1[i]     = ct2; /* size of each message */
1030   }
1031   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1032   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1042 {
1043   Mat            B;
1044   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1045   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046   PetscErrorCode ierr;
1047   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1048   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1049   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1050   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1051 
1052   PetscFunctionBegin;
1053   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1054   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
1055 
1056   if (scall == MAT_INITIAL_MATRIX) {
1057     /* ----------------------------------------------------------------
1058          Tell every processor the number of nonzeros per row
1059     */
1060     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1061     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1062       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1063     }
1064     ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1065     for (i=0; i<size; i++) {
1066       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1067       displs[i]     = A->rmap->range[i];
1068     }
1069 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1070     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1071 #else
1072     sendcount = A->rmap->rend - A->rmap->rstart;
1073     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1074 #endif
1075     /* ---------------------------------------------------------------
1076          Create the sequential matrix of the same type as the local block diagonal
1077     */
1078     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1079     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1080     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1081     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1082     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1083     ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
1084     **Bin = B;
1085     b     = (Mat_SeqAIJ*)B->data;
1086 
1087     /*--------------------------------------------------------------------
1088        Copy my part of matrix column indices over
1089     */
1090     sendcount  = ad->nz + bd->nz;
1091     jsendbuf   = b->j + b->i[rstarts[rank]];
1092     a_jsendbuf = ad->j;
1093     b_jsendbuf = bd->j;
1094     n          = A->rmap->rend - A->rmap->rstart;
1095     cnt        = 0;
1096     for (i=0; i<n; i++) {
1097 
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into useable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     sendcount = ad->nz + bd->nz;
1161     sendbuf   = b->a + b->i[rstarts[rank]];
1162     a_sendbuf = ad->a;
1163     b_sendbuf = bd->a;
1164     b_sendj   = bd->j;
1165     n         = A->rmap->rend - A->rmap->rstart;
1166     cnt       = 0;
1167     for (i=0; i<n; i++) {
1168 
1169       /* put in lower diagonal portion */
1170       m = bd->i[i+1] - bd->i[i];
1171       while (m > 0) {
1172         /* is it above diagonal (in bd (compressed) numbering) */
1173         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1174         sendbuf[cnt++] = *b_sendbuf++;
1175         m--;
1176         b_sendj++;
1177       }
1178 
1179       /* put in diagonal portion */
1180       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1181         sendbuf[cnt++] = *a_sendbuf++;
1182       }
1183 
1184       /* put in upper diagonal portion */
1185       while (m-- > 0) {
1186         sendbuf[cnt++] = *b_sendbuf++;
1187         b_sendj++;
1188       }
1189     }
1190     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191 
1192     /* -----------------------------------------------------------------
1193        Gather all numerical values to all processors
1194     */
1195     if (!recvcounts) {
1196       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1197     }
1198     for (i=0; i<size; i++) {
1199       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1200     }
1201     displs[0] = 0;
1202     for (i=1; i<size; i++) {
1203       displs[i] = displs[i-1] + recvcounts[i-1];
1204     }
1205     recvbuf = b->a;
1206 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1207     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1208 #else
1209     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
1210 #endif
1211   }  /* endof (flag == MAT_GET_VALUES) */
1212   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1213 
1214   if (A->symmetric) {
1215     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1216   } else if (A->hermitian) {
1217     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1218   } else if (A->structurally_symmetric) {
1219     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1220   }
1221   PetscFunctionReturn(0);
1222 }
1223 
1224 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1225 {
1226   PetscErrorCode ierr;
1227   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1228   Mat_SubMat     *submatj = c->submatis1;
1229   PetscInt       i;
1230 
1231   PetscFunctionBegin;
1232   if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
1233     ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1234 
1235     for (i=0; i<submatj->nrqr; ++i) {
1236       ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1237     }
1238     ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1239 
1240     if (submatj->rbuf1) {
1241       ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1242       ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1243     }
1244 
1245     for (i=0; i<submatj->nrqs; ++i) {
1246       ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1247     }
1248     ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1249     ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1250   }
1251 
1252 #if defined(PETSC_USE_CTABLE)
1253   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1254   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1255   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1256 #else
1257   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1258 #endif
1259 
1260   if (!submatj->allcolumns) {
1261 #if defined(PETSC_USE_CTABLE)
1262     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1263 #else
1264     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1265 #endif
1266   }
1267   ierr = submatj->destroy(C);CHKERRQ(ierr);
1268   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1269 
1270   ierr = PetscFree(submatj);CHKERRQ(ierr);
1271   PetscFunctionReturn(0);
1272 }
1273 
1274 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1275 {
1276   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1277   Mat            submat,A = c->A,B = c->B;
1278   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1279   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1280   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1281   const PetscInt *icol,*irow;
1282   PetscInt       nrow,ncol,start;
1283   PetscErrorCode ierr;
1284   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1285   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1286   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1287   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1288   PetscInt       *lens,rmax,ncols,*cols,Crow;
1289 #if defined(PETSC_USE_CTABLE)
1290   PetscTable     cmap,rmap;
1291   PetscInt       *cmap_loc,*rmap_loc;
1292 #else
1293   PetscInt       *cmap,*rmap;
1294 #endif
1295   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1296   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1297   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1298   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1299   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1300   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1301   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1302   MPI_Comm       comm;
1303   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1304   PetscMPIInt    *onodes1,*olengths1,idex,end;
1305   Mat_SubMat     *smatis1;
1306   PetscBool      isrowsorted;
1307 
1308   PetscFunctionBegin;
1309   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1310 
1311   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1312   size = c->size;
1313   rank = c->rank;
1314 
1315   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1316   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1317 
1318   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1319   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1320   if (allcolumns) {
1321     icol = NULL;
1322     ncol = C->cmap->N;
1323   } else {
1324     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1325     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1326   }
1327 
1328   if (scall == MAT_INITIAL_MATRIX) {
1329     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1330 
1331     /* Get some new tags to keep the communication clean */
1332     tag1 = ((PetscObject)C)->tag;
1333     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1334     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1335 
1336     /* evaluate communication - mesg to who, length of mesg, and buffer space
1337      required. Based on this, buffers are allocated, and data copied into them */
1338     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1339     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1340 
1341     /* w1[proc] = num of rows owned by proc -- to be requested */
1342     proc = 0;
1343     nrqs = 0; /* num of outgoing messages */
1344     for (j=0; j<nrow; j++) {
1345       row  = irow[j]; /* sorted! */
1346       while (row >= C->rmap->range[proc+1]) proc++;
1347       w1[proc]++;
1348       row2proc[j] = proc; /* map row index to proc */
1349 
1350       if (proc != rank && !w2[proc]) {
1351         w2[proc] = 1; nrqs++;
1352       }
1353     }
1354     w1[rank] = 0;  /* rows owned by self will not be requested */
1355 
1356     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1357     for (proc=0,j=0; proc<size; proc++) {
1358       if (w1[proc]) { pa[j++] = proc;}
1359     }
1360 
1361     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1362     msz = 0;              /* total mesg length (for all procs) */
1363     for (i=0; i<nrqs; i++) {
1364       proc      = pa[i];
1365       w1[proc] += 3;
1366       msz      += w1[proc];
1367     }
1368     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1369 
1370     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1371     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1372     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1373 
1374     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1375        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1376     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1377 
1378     /* Now post the Irecvs corresponding to these messages */
1379     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1380 
1381     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1382     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1383 
1384     /* Allocate Memory for outgoing messages */
1385     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1386     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1387     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1388 
1389     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1390     iptr = tmp;
1391     for (i=0; i<nrqs; i++) {
1392       proc        = pa[i];
1393       sbuf1[proc] = iptr;
1394       iptr       += w1[proc];
1395     }
1396 
1397     /* Form the outgoing messages */
1398     /* Initialize the header space */
1399     for (i=0; i<nrqs; i++) {
1400       proc      = pa[i];
1401       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1402       ptr[proc] = sbuf1[proc] + 3;
1403     }
1404 
1405     /* Parse the isrow and copy data into outbuf */
1406     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1407     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1408       proc = row2proc[j];
1409       if (proc != rank) { /* copy to the outgoing buf*/
1410         *ptr[proc] = irow[j];
1411         ctr[proc]++; ptr[proc]++;
1412       }
1413     }
1414 
1415     /* Update the headers for the current IS */
1416     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1417       if ((ctr_j = ctr[j])) {
1418         sbuf1_j        = sbuf1[j];
1419         k              = ++sbuf1_j[0];
1420         sbuf1_j[2*k]   = ctr_j;
1421         sbuf1_j[2*k-1] = 0;
1422       }
1423     }
1424 
1425     /* Now post the sends */
1426     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1427     for (i=0; i<nrqs; ++i) {
1428       proc = pa[i];
1429       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1430     }
1431 
1432     /* Post Receives to capture the buffer size */
1433     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1434     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1435 
1436     rbuf2[0] = tmp + msz;
1437     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1438 
1439     for (i=0; i<nrqs; ++i) {
1440       proc = pa[i];
1441       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1442     }
1443 
1444     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1445 
1446     /* Send to other procs the buf size they should allocate */
1447     /* Receive messages*/
1448     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1449     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1450 
1451     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1452     for (i=0; i<nrqr; ++i) {
1453       req_size[i] = 0;
1454       rbuf1_i        = rbuf1[i];
1455       start          = 2*rbuf1_i[0] + 1;
1456       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1457       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1458       sbuf2_i        = sbuf2[i];
1459       for (j=start; j<end; j++) {
1460         k            = rbuf1_i[j] - rstart;
1461         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1462         sbuf2_i[j]   = ncols;
1463         req_size[i] += ncols;
1464       }
1465       req_source1[i] = r_status1[i].MPI_SOURCE;
1466 
1467       /* form the header */
1468       sbuf2_i[0] = req_size[i];
1469       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1470 
1471       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1472     }
1473 
1474     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1475     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1476 
1477     /* rbuf2 is received, Post recv column indices a->j */
1478     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1479 
1480     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1481     for (i=0; i<nrqs; ++i) {
1482       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1483       req_source2[i] = r_status2[i].MPI_SOURCE;
1484       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1485     }
1486 
1487     /* Wait on sends1 and sends2 */
1488     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1489     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1490     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1491     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1492 
1493     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1494     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1495 
1496     /* Now allocate sending buffers for a->j, and send them off */
1497     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1498     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1499     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1500     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1501 
1502     for (i=0; i<nrqr; i++) { /* for each requested message */
1503       rbuf1_i   = rbuf1[i];
1504       sbuf_aj_i = sbuf_aj[i];
1505       ct1       = 2*rbuf1_i[0] + 1;
1506       ct2       = 0;
1507       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1508 
1509       kmax = rbuf1[i][2];
1510       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1511         row    = rbuf1_i[ct1] - rstart;
1512         nzA    = ai[row+1] - ai[row];
1513         nzB    = bi[row+1] - bi[row];
1514         ncols  = nzA + nzB;
1515         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1516 
1517         /* load the column indices for this row into cols*/
1518         cols = sbuf_aj_i + ct2;
1519 
1520         lwrite = 0;
1521         for (l=0; l<nzB; l++) {
1522           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1523         }
1524         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1525         for (l=0; l<nzB; l++) {
1526           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1527         }
1528 
1529         ct2 += ncols;
1530       }
1531       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1532     }
1533 
1534     /* create column map (cmap): global col of C -> local col of submat */
1535 #if defined(PETSC_USE_CTABLE)
1536     if (!allcolumns) {
1537       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1538       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1539       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1540         if (icol[j] >= cstart && icol[j] <cend) {
1541           cmap_loc[icol[j] - cstart] = j+1;
1542         } else { /* use PetscTable for non-local col indices */
1543           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1544         }
1545       }
1546     } else {
1547       cmap     = NULL;
1548       cmap_loc = NULL;
1549     }
1550     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1551 #else
1552     if (!allcolumns) {
1553       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1554       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1555     } else {
1556       cmap = NULL;
1557     }
1558 #endif
1559 
1560     /* Create lens for MatSeqAIJSetPreallocation() */
1561     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1562 
1563     /* Compute lens from local part of C */
1564     for (j=0; j<nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) {
1568         /* diagonal part A = c->A */
1569         ncols = ai[row-rstart+1] - ai[row-rstart];
1570         cols  = aj + ai[row-rstart];
1571         if (!allcolumns) {
1572           for (k=0; k<ncols; k++) {
1573 #if defined(PETSC_USE_CTABLE)
1574             tcol = cmap_loc[cols[k]];
1575 #else
1576             tcol = cmap[cols[k]+cstart];
1577 #endif
1578             if (tcol) lens[j]++;
1579           }
1580         } else { /* allcolumns */
1581           lens[j] = ncols;
1582         }
1583 
1584         /* off-diagonal part B = c->B */
1585         ncols = bi[row-rstart+1] - bi[row-rstart];
1586         cols  = bj + bi[row-rstart];
1587         if (!allcolumns) {
1588           for (k=0; k<ncols; k++) {
1589 #if defined(PETSC_USE_CTABLE)
1590             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1591 #else
1592             tcol = cmap[bmap[cols[k]]];
1593 #endif
1594             if (tcol) lens[j]++;
1595           }
1596         } else { /* allcolumns */
1597           lens[j] += ncols;
1598         }
1599       }
1600     }
1601 
1602     /* Create row map (rmap): global row of C -> local row of submat */
1603 #if defined(PETSC_USE_CTABLE)
1604     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1605     for (j=0; j<nrow; j++) {
1606       row  = irow[j];
1607       proc = row2proc[j];
1608       if (proc == rank) { /* a local row */
1609         rmap_loc[row - rstart] = j;
1610       } else {
1611         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1612       }
1613     }
1614 #else
1615     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1616     for (j=0; j<nrow; j++) {
1617       rmap[irow[j]] = j;
1618     }
1619 #endif
1620 
1621     /* Update lens from offproc data */
1622     /* recv a->j is done */
1623     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1624     for (i=0; i<nrqs; i++) {
1625       proc    = pa[i];
1626       sbuf1_i = sbuf1[proc];
1627       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1628       ct1     = 2 + 1;
1629       ct2     = 0;
1630       rbuf2_i = rbuf2[i]; /* received length of C->j */
1631       rbuf3_i = rbuf3[i]; /* received C->j */
1632 
1633       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1634       max1   = sbuf1_i[2];
1635       for (k=0; k<max1; k++,ct1++) {
1636 #if defined(PETSC_USE_CTABLE)
1637         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1638         row--;
1639         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1640 #else
1641         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1642 #endif
1643         /* Now, store row index of submat in sbuf1_i[ct1] */
1644         sbuf1_i[ct1] = row;
1645 
1646         nnz = rbuf2_i[ct1];
1647         if (!allcolumns) {
1648           for (l=0; l<nnz; l++,ct2++) {
1649 #if defined(PETSC_USE_CTABLE)
1650             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1651               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1652             } else {
1653               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1654             }
1655 #else
1656             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1657 #endif
1658             if (tcol) lens[row]++;
1659           }
1660         } else { /* allcolumns */
1661           lens[row] += nnz;
1662         }
1663       }
1664     }
1665     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1666     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1667 
1668     /* Create the submatrices */
1669     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1670     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1671 
1672     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1673     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1674     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1675     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1676     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1677 
1678     /* create struct Mat_SubMat and attached it to submat */
1679     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1680     subc = (Mat_SeqAIJ*)submat->data;
1681     subc->submatis1 = smatis1;
1682 
1683     smatis1->id          = 0;
1684     smatis1->nrqs        = nrqs;
1685     smatis1->nrqr        = nrqr;
1686     smatis1->rbuf1       = rbuf1;
1687     smatis1->rbuf2       = rbuf2;
1688     smatis1->rbuf3       = rbuf3;
1689     smatis1->sbuf2       = sbuf2;
1690     smatis1->req_source2 = req_source2;
1691 
1692     smatis1->sbuf1       = sbuf1;
1693     smatis1->ptr         = ptr;
1694     smatis1->tmp         = tmp;
1695     smatis1->ctr         = ctr;
1696 
1697     smatis1->pa           = pa;
1698     smatis1->req_size     = req_size;
1699     smatis1->req_source1  = req_source1;
1700 
1701     smatis1->allcolumns  = allcolumns;
1702     smatis1->singleis    = PETSC_TRUE;
1703     smatis1->row2proc    = row2proc;
1704     smatis1->rmap        = rmap;
1705     smatis1->cmap        = cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     smatis1->rmap_loc    = rmap_loc;
1708     smatis1->cmap_loc    = cmap_loc;
1709 #endif
1710 
1711     smatis1->destroy     = submat->ops->destroy;
1712     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1713     submat->factortype   = C->factortype;
1714 
1715     /* compute rmax */
1716     rmax = 0;
1717     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1718 
1719   } else { /* scall == MAT_REUSE_MATRIX */
1720     submat = submats[0];
1721     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1722 
1723     subc    = (Mat_SeqAIJ*)submat->data;
1724     rmax    = subc->rmax;
1725     smatis1 = subc->submatis1;
1726     nrqs        = smatis1->nrqs;
1727     nrqr        = smatis1->nrqr;
1728     rbuf1       = smatis1->rbuf1;
1729     rbuf2       = smatis1->rbuf2;
1730     rbuf3       = smatis1->rbuf3;
1731     req_source2 = smatis1->req_source2;
1732 
1733     sbuf1     = smatis1->sbuf1;
1734     sbuf2     = smatis1->sbuf2;
1735     ptr       = smatis1->ptr;
1736     tmp       = smatis1->tmp;
1737     ctr       = smatis1->ctr;
1738 
1739     pa         = smatis1->pa;
1740     req_size   = smatis1->req_size;
1741     req_source1 = smatis1->req_source1;
1742 
1743     allcolumns = smatis1->allcolumns;
1744     row2proc   = smatis1->row2proc;
1745     rmap       = smatis1->rmap;
1746     cmap       = smatis1->cmap;
1747 #if defined(PETSC_USE_CTABLE)
1748     rmap_loc   = smatis1->rmap_loc;
1749     cmap_loc   = smatis1->cmap_loc;
1750 #endif
1751   }
1752 
1753   /* Post recv matrix values */
1754   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1755   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1756   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1757   for (i=0; i<nrqs; ++i) {
1758     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1759     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1760   }
1761 
1762   /* Allocate sending buffers for a->a, and send them off */
1763   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1764   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1765   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1766   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1767 
1768   for (i=0; i<nrqr; i++) {
1769     rbuf1_i   = rbuf1[i];
1770     sbuf_aa_i = sbuf_aa[i];
1771     ct1       = 2*rbuf1_i[0]+1;
1772     ct2       = 0;
1773     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1774 
1775     kmax = rbuf1_i[2];
1776     for (k=0; k<kmax; k++,ct1++) {
1777       row = rbuf1_i[ct1] - rstart;
1778       nzA = ai[row+1] - ai[row];
1779       nzB = bi[row+1] - bi[row];
1780       ncols  = nzA + nzB;
1781       cworkB = bj + bi[row];
1782       vworkA = a_a + ai[row];
1783       vworkB = b_a + bi[row];
1784 
1785       /* load the column values for this row into vals*/
1786       vals = sbuf_aa_i + ct2;
1787 
1788       lwrite = 0;
1789       for (l=0; l<nzB; l++) {
1790         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1791       }
1792       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1793       for (l=0; l<nzB; l++) {
1794         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1795       }
1796 
1797       ct2 += ncols;
1798     }
1799     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1800   }
1801 
1802   /* Assemble submat */
1803   /* First assemble the local rows */
1804   for (j=0; j<nrow; j++) {
1805     row  = irow[j];
1806     proc = row2proc[j];
1807     if (proc == rank) {
1808       Crow = row - rstart;  /* local row index of C */
1809 #if defined(PETSC_USE_CTABLE)
1810       row = rmap_loc[Crow]; /* row index of submat */
1811 #else
1812       row = rmap[row];
1813 #endif
1814 
1815       if (allcolumns) {
1816         /* diagonal part A = c->A */
1817         ncols = ai[Crow+1] - ai[Crow];
1818         cols  = aj + ai[Crow];
1819         vals  = a->a + ai[Crow];
1820         i     = 0;
1821         for (k=0; k<ncols; k++) {
1822           subcols[i]   = cols[k] + cstart;
1823           subvals[i++] = vals[k];
1824         }
1825 
1826         /* off-diagonal part B = c->B */
1827         ncols = bi[Crow+1] - bi[Crow];
1828         cols  = bj + bi[Crow];
1829         vals  = b->a + bi[Crow];
1830         for (k=0; k<ncols; k++) {
1831           subcols[i]   = bmap[cols[k]];
1832           subvals[i++] = vals[k];
1833         }
1834 
1835         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1836 
1837       } else { /* !allcolumns */
1838 #if defined(PETSC_USE_CTABLE)
1839         /* diagonal part A = c->A */
1840         ncols = ai[Crow+1] - ai[Crow];
1841         cols  = aj + ai[Crow];
1842         vals  = a->a + ai[Crow];
1843         i     = 0;
1844         for (k=0; k<ncols; k++) {
1845           tcol = cmap_loc[cols[k]];
1846           if (tcol) {
1847             subcols[i]   = --tcol;
1848             subvals[i++] = vals[k];
1849           }
1850         }
1851 
1852         /* off-diagonal part B = c->B */
1853         ncols = bi[Crow+1] - bi[Crow];
1854         cols  = bj + bi[Crow];
1855         vals  = b->a + bi[Crow];
1856         for (k=0; k<ncols; k++) {
1857           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1858           if (tcol) {
1859             subcols[i]   = --tcol;
1860             subvals[i++] = vals[k];
1861           }
1862         }
1863 #else
1864         /* diagonal part A = c->A */
1865         ncols = ai[Crow+1] - ai[Crow];
1866         cols  = aj + ai[Crow];
1867         vals  = a->a + ai[Crow];
1868         i     = 0;
1869         for (k=0; k<ncols; k++) {
1870           tcol = cmap[cols[k]+cstart];
1871           if (tcol) {
1872             subcols[i]   = --tcol;
1873             subvals[i++] = vals[k];
1874           }
1875         }
1876 
1877         /* off-diagonal part B = c->B */
1878         ncols = bi[Crow+1] - bi[Crow];
1879         cols  = bj + bi[Crow];
1880         vals  = b->a + bi[Crow];
1881         for (k=0; k<ncols; k++) {
1882           tcol = cmap[bmap[cols[k]]];
1883           if (tcol) {
1884             subcols[i]   = --tcol;
1885             subvals[i++] = vals[k];
1886           }
1887         }
1888 #endif
1889         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1890       }
1891     }
1892   }
1893 
1894   /* Now assemble the off-proc rows */
1895   for (i=0; i<nrqs; i++) { /* for each requested message */
1896     /* recv values from other processes */
1897     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1898     proc    = pa[idex];
1899     sbuf1_i = sbuf1[proc];
1900     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1901     ct1     = 2 + 1;
1902     ct2     = 0; /* count of received C->j */
1903     ct3     = 0; /* count of received C->j that will be inserted into submat */
1904     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1905     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1906     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1907 
1908     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1909     max1 = sbuf1_i[2];             /* num of rows */
1910     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1911       row = sbuf1_i[ct1]; /* row index of submat */
1912       if (!allcolumns) {
1913         idex = 0;
1914         if (scall == MAT_INITIAL_MATRIX) {
1915           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1916           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1917 #if defined(PETSC_USE_CTABLE)
1918             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1919               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1920             } else {
1921               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1922             }
1923 #else
1924             tcol = cmap[rbuf3_i[ct2]];
1925 #endif
1926             if (tcol) {
1927               subcols[idex]   = --tcol;
1928               subvals[idex++] = rbuf4_i[ct2];
1929 
1930               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1931                For reuse, we replace received C->j with index that should be inserted to submat */
1932               rbuf3_i[ct3++] = ct2;
1933             }
1934           }
1935           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1936 
1937         } else { /* scall == MAT_REUSE_MATRIX */
1938           submat = submats[0];
1939           subc   = (Mat_SeqAIJ*)submat->data;
1940 
1941           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1942           for (l=0; l<nnz; l++) {
1943             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1944             subvals[idex++] = rbuf4_i[ct2];
1945           }
1946 
1947           bj = subc->j + subc->i[row];
1948           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1949         }
1950       } else { /* allcolumns */
1951         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1952         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1953         ct2 += nnz;
1954       }
1955     }
1956   }
1957 
1958   /* sending a->a are done */
1959   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1960   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1961 
1962   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1963   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1964   submats[0] = submat;
1965 
1966   /* Restore the indices */
1967   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1968   if (!allcolumns) {
1969     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1970   }
1971 
1972   /* Destroy allocated memory */
1973   for (i=0; i<nrqs; ++i) {
1974     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1975   }
1976   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1977   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1978   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1979 
1980   if (scall == MAT_INITIAL_MATRIX) {
1981     ierr = PetscFree(lens);CHKERRQ(ierr);
1982     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1983     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1984   }
1985   PetscFunctionReturn(0);
1986 }
1987 
1988 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1989 {
1990   PetscErrorCode ierr;
1991   PetscInt       ncol;
1992   PetscBool      colflag,allcolumns=PETSC_FALSE;
1993 
1994   PetscFunctionBegin;
1995   /* Allocate memory to hold all the submatrices */
1996   if (scall == MAT_INITIAL_MATRIX) {
1997     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
1998   }
1999 
2000   /* Check for special case: each processor gets entire matrix columns */
2001   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2002   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2003   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2004 
2005   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2006   PetscFunctionReturn(0);
2007 }
2008 
2009 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2010 {
2011   PetscErrorCode ierr;
2012   PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
2013   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
2014   Mat_SeqAIJ     *subc;
2015   Mat_SubMat     *smat;
2016 
2017   PetscFunctionBegin;
2018   /* Check for special case: each processor has a single IS */
2019   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
2020     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2021     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
2022     PetscFunctionReturn(0);
2023   }
2024 
2025   //if (scall == MAT_REUSE_MATRIX && !ismax) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"n=0 is not supported for MatGetSubMatrices(mat,n,isrow,iscol,MAT_REUSE_MATRIX,...). Set n=1 with zero-length isrow and iscolumn instead");
2026 
2027   /* Collect global wantallmatrix and nstages */
2028   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
2029   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2030   if (!nmax) nmax = 1;
2031 
2032   if (scall == MAT_INITIAL_MATRIX) {
2033     /* Collect global wantallmatrix and nstages */
2034     if (ismax == 1 && C->rmap->N == C->cmap->N) {
2035       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2036       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2037       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2038       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2039       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2040         wantallmatrix = PETSC_TRUE;
2041 
2042         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2043       }
2044     }
2045 
2046     /* Determine the number of stages through which submatrices are done
2047        Each stage will extract nmax submatrices.
2048        nmax is determined by the matrix column dimension.
2049        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2050     */
2051     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2052 
2053     in[0] = -1*(PetscInt)wantallmatrix;
2054     in[1] = nstages;
2055     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2056     wantallmatrix = (PetscBool)(-out[0]);
2057     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2058 
2059   } else { /* MAT_REUSE_MATRIX */
2060     subc = (Mat_SeqAIJ*)((*submat)[0]->data);
2061     smat   = subc->submatis1;
2062     if (!smat) {
2063       /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2064       wantallmatrix = PETSC_TRUE;
2065     } else if (smat->singleis) {
2066       ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2067       PetscFunctionReturn(0);
2068     } else {
2069       nstages = smat->nstages;
2070     }
2071   }
2072 
2073   if (wantallmatrix) {
2074     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2075     PetscFunctionReturn(0);
2076   }
2077 
2078   /* Allocate memory to hold all the submatrices */
2079   if (scall == MAT_INITIAL_MATRIX) {
2080     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2081   }
2082 
2083   PetscMPIInt rank;
2084   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr);
2085   printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax);
2086   for (i=0,pos=0; i<nstages; i++) {
2087     if (pos+nmax <= ismax) max_no = nmax;
2088     else if (pos == ismax) max_no = 0;
2089     else                   max_no = ismax-pos;
2090     if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i);
2091     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2092     pos += max_no;
2093   }
2094 
2095   if (scall == MAT_INITIAL_MATRIX) {
2096     /* save nstages for reuse */
2097     subc = (Mat_SeqAIJ*)((*submat)[0]->data);
2098     smat = subc->submatis1;
2099     if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"smat does not exit");
2100     smat->nstages = nstages;
2101   }
2102   PetscFunctionReturn(0);
2103 }
2104 
2105 /* -------------------------------------------------------------------------*/
2106 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2107 {
2108   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2109   Mat            A  = c->A;
2110   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2111   const PetscInt **icol,**irow;
2112   PetscInt       *nrow,*ncol,start;
2113   PetscErrorCode ierr;
2114   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2115   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2116   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2117   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2118   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2119 #if defined(PETSC_USE_CTABLE)
2120   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2121 #else
2122   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2123 #endif
2124   const PetscInt *irow_i;
2125   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2126   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2127   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2128   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2129   MPI_Status     *r_status3,*r_status4,*s_status4;
2130   MPI_Comm       comm;
2131   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2132   PetscMPIInt    *onodes1,*olengths1,end;
2133   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2134   Mat_SubMat     **smats,*smat_i;
2135   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2136   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2137 
2138   PetscFunctionBegin;
2139   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2140   size = c->size;
2141   rank = c->rank;
2142 
2143   ierr = PetscMalloc5(ismax,&smats,ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2144   ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2145 
2146   for (i=0; i<ismax; i++) {
2147     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2148     if (!issorted[i]) iscsorted = issorted[i];
2149 
2150     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2151 
2152     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2153     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2154 
2155     /* Check for special case: allcolumn */
2156     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2157     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2158     if (colflag && ncol[i] == C->cmap->N) {
2159       allcolumns[i] = PETSC_TRUE;
2160       icol[i] = NULL;
2161     } else {
2162       allcolumns[i] = PETSC_FALSE;
2163       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2164     }
2165   }
2166 
2167   if (scall == MAT_REUSE_MATRIX) {
2168     /* Assumes new rows are same length as the old rows */
2169     for (i=0; i<ismax; i++) {
2170       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2171       subc = (Mat_SeqAIJ*)(submats[i]->data);
2172       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2173 
2174       /* Initial matrix as if empty */
2175       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2176 
2177       /* Initial matrix as if empty */
2178       submats[i]->factortype = C->factortype;
2179 
2180       smat_i   = subc->submatis1;
2181       smats[i] = smat_i;
2182 
2183       nrqs        = smat_i->nrqs;
2184       nrqr        = smat_i->nrqr;
2185       rbuf1       = smat_i->rbuf1;
2186       rbuf2       = smat_i->rbuf2;
2187       rbuf3       = smat_i->rbuf3;
2188       req_source2 = smat_i->req_source2;
2189 
2190       sbuf1     = smat_i->sbuf1;
2191       sbuf2     = smat_i->sbuf2;
2192       ptr       = smat_i->ptr;
2193       tmp       = smat_i->tmp;
2194       ctr       = smat_i->ctr;
2195 
2196       pa          = smat_i->pa;
2197       req_size    = smat_i->req_size;
2198       req_source1 = smat_i->req_source1;
2199 
2200       allcolumns[i] = smat_i->allcolumns;
2201       row2proc[i]   = smat_i->row2proc;
2202       rmap[i]       = smat_i->rmap;
2203       cmap[i]       = smat_i->cmap;
2204     }
2205 
2206     if (!ismax){
2207       i = 0;
2208       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2209       subc = (Mat_SeqAIJ*)(submats[i]->data);
2210 
2211       /* Initial matrix as if empty */
2212       ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2213 
2214       /* Initial matrix as if empty */
2215       submats[i]->factortype = C->factortype;
2216       smat_i   = subc->submatis1;
2217 
2218       nrqs        = smat_i->nrqs;
2219       nrqr        = smat_i->nrqr;
2220       rbuf1       = smat_i->rbuf1;
2221       rbuf2       = smat_i->rbuf2;
2222       rbuf3       = smat_i->rbuf3;
2223       req_source2 = smat_i->req_source2;
2224 
2225       sbuf1     = smat_i->sbuf1;
2226       sbuf2     = smat_i->sbuf2;
2227       ptr       = smat_i->ptr;
2228       tmp       = smat_i->tmp;
2229       ctr       = smat_i->ctr;
2230 
2231       pa          = smat_i->pa;
2232       req_size    = smat_i->req_size;
2233       req_source1 = smat_i->req_source1;
2234 
2235       allcolumns[i] = PETSC_TRUE;
2236     }
2237   } else { /* scall == MAT_INITIAL_MATRIX */
2238     /* Get some new tags to keep the communication clean */
2239     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2240     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2241 
2242     /* evaluate communication - mesg to who, length of mesg, and buffer space
2243      required. Based on this, buffers are allocated, and data copied into them*/
2244     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2245 
2246     for (i=0; i<ismax; i++) {
2247       jmax   = nrow[i];
2248       irow_i = irow[i];
2249 
2250       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2251       row2proc[i] = row2proc_i;
2252 
2253       if (issorted[i]) proc = 0;
2254       for (j=0; j<jmax; j++) {
2255         if (!issorted[i]) proc = 0;
2256         row = irow_i[j];
2257         while (row >= C->rmap->range[proc+1]) proc++;
2258         w4[proc]++;
2259         row2proc_i[j] = proc; /* map row index to proc */
2260       }
2261       for (j=0; j<size; j++) {
2262         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2263       }
2264     }
2265 
2266     nrqs     = 0;              /* no of outgoing messages */
2267     msz      = 0;              /* total mesg length (for all procs) */
2268     w1[rank] = 0;              /* no mesg sent to self */
2269     w3[rank] = 0;
2270     for (i=0; i<size; i++) {
2271       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2272     }
2273     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2274     for (i=0,j=0; i<size; i++) {
2275       if (w1[i]) { pa[j] = i; j++; }
2276     }
2277 
2278     /* Each message would have a header = 1 + 2*(no of IS) + data */
2279     for (i=0; i<nrqs; i++) {
2280       j      = pa[i];
2281       w1[j] += w2[j] + 2* w3[j];
2282       msz   += w1[j];
2283     }
2284     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2285 
2286     /* Determine the number of messages to expect, their lengths, from from-ids */
2287     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2288     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2289 
2290     /* Now post the Irecvs corresponding to these messages */
2291     tag0 = ((PetscObject)C)->tag;
2292     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2293 
2294     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2295     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2296 
2297     /* Allocate Memory for outgoing messages */
2298     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2299     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2300     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2301 
2302     {
2303       PetscInt *iptr = tmp;
2304       k    = 0;
2305       for (i=0; i<nrqs; i++) {
2306         j        = pa[i];
2307         iptr    += k;
2308         sbuf1[j] = iptr;
2309         k        = w1[j];
2310       }
2311     }
2312 
2313     /* Form the outgoing messages. Initialize the header space */
2314     for (i=0; i<nrqs; i++) {
2315       j           = pa[i];
2316       sbuf1[j][0] = 0;
2317       ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2318       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2319     }
2320 
2321     /* Parse the isrow and copy data into outbuf */
2322     for (i=0; i<ismax; i++) {
2323       row2proc_i = row2proc[i];
2324       ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2325       irow_i = irow[i];
2326       jmax   = nrow[i];
2327       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2328         proc = row2proc_i[j];
2329         if (proc != rank) { /* copy to the outgoing buf*/
2330           ctr[proc]++;
2331           *ptr[proc] = irow_i[j];
2332           ptr[proc]++;
2333         }
2334       }
2335       /* Update the headers for the current IS */
2336       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2337         if ((ctr_j = ctr[j])) {
2338           sbuf1_j        = sbuf1[j];
2339           k              = ++sbuf1_j[0];
2340           sbuf1_j[2*k]   = ctr_j;
2341           sbuf1_j[2*k-1] = i;
2342         }
2343       }
2344     }
2345 
2346     /*  Now  post the sends */
2347     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2348     for (i=0; i<nrqs; ++i) {
2349       j    = pa[i];
2350       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2351     }
2352 
2353     /* Post Receives to capture the buffer size */
2354     ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2355     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
2356     rbuf2[0] = tmp + msz;
2357     for (i=1; i<nrqs; ++i) {
2358       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2359     }
2360     for (i=0; i<nrqs; ++i) {
2361       j    = pa[i];
2362       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
2363     }
2364 
2365     /* Send to other procs the buf size they should allocate */
2366     /* Receive messages*/
2367     ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2368     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2369     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2370     {
2371       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2372       PetscInt   *sbuf2_i;
2373 
2374       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
2375       for (i=0; i<nrqr; ++i) {
2376         req_size[i] = 0;
2377         rbuf1_i        = rbuf1[i];
2378         start          = 2*rbuf1_i[0] + 1;
2379         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2380         ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
2381         sbuf2_i        = sbuf2[i];
2382         for (j=start; j<end; j++) {
2383           id              = rbuf1_i[j] - rstart;
2384           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2385           sbuf2_i[j]      = ncols;
2386           req_size[i] += ncols;
2387         }
2388         req_source1[i] = r_status1[i].MPI_SOURCE;
2389         /* form the header */
2390         sbuf2_i[0] = req_size[i];
2391         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2392 
2393         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
2394       }
2395     }
2396     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2397     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2398     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2399 
2400     /* Receive messages*/
2401     ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2402     ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2403 
2404     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
2405     for (i=0; i<nrqs; ++i) {
2406       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
2407       req_source2[i] = r_status2[i].MPI_SOURCE;
2408       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
2409     }
2410     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2411     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2412 
2413     /* Wait on sends1 and sends2 */
2414     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2415     ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2416 
2417     if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2418     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2419     ierr = PetscFree(s_status1);CHKERRQ(ierr);
2420     ierr = PetscFree(s_status2);CHKERRQ(ierr);
2421     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2422     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2423 
2424     /* Now allocate sending buffers for a->j, and send them off */
2425     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2426     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2427     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2428     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2429 
2430     ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2431     {
2432       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2433       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2434       PetscInt cend = C->cmap->rend;
2435       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2436 
2437       for (i=0; i<nrqr; i++) {
2438         rbuf1_i   = rbuf1[i];
2439         sbuf_aj_i = sbuf_aj[i];
2440         ct1       = 2*rbuf1_i[0] + 1;
2441         ct2       = 0;
2442         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2443           kmax = rbuf1[i][2*j];
2444           for (k=0; k<kmax; k++,ct1++) {
2445             row    = rbuf1_i[ct1] - rstart;
2446             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2447             ncols  = nzA + nzB;
2448             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2449 
2450             /* load the column indices for this row into cols */
2451             cols = sbuf_aj_i + ct2;
2452 
2453             lwrite = 0;
2454             for (l=0; l<nzB; l++) {
2455               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2456             }
2457             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2458             for (l=0; l<nzB; l++) {
2459               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2460             }
2461 
2462             ct2 += ncols;
2463           }
2464         }
2465         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
2466       }
2467     }
2468     ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
2469 
2470     /* create col map: global col of C -> local col of submatrices */
2471     {
2472       const PetscInt *icol_i;
2473 #if defined(PETSC_USE_CTABLE)
2474       for (i=0; i<ismax; i++) {
2475         if (!allcolumns[i]) {
2476           ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2477 
2478           jmax   = ncol[i];
2479           icol_i = icol[i];
2480           cmap_i = cmap[i];
2481           for (j=0; j<jmax; j++) {
2482             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2483           }
2484         } else cmap[i] = NULL;
2485       }
2486 #else
2487       for (i=0; i<ismax; i++) {
2488         if (!allcolumns[i]) {
2489           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2490           jmax   = ncol[i];
2491           icol_i = icol[i];
2492           cmap_i = cmap[i];
2493           for (j=0; j<jmax; j++) {
2494             cmap_i[icol_i[j]] = j+1;
2495           }
2496         } else cmap[i] = NULL;
2497       }
2498 #endif
2499     }
2500 
2501     /* Create lens which is required for MatCreate... */
2502     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2503     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2504 
2505     if (ismax) {
2506       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2507     }
2508     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2509 
2510     /* Update lens from local data */
2511     for (i=0; i<ismax; i++) {
2512       row2proc_i = row2proc[i];
2513       jmax = nrow[i];
2514       if (!allcolumns[i]) cmap_i = cmap[i];
2515       irow_i = irow[i];
2516       lens_i = lens[i];
2517       for (j=0; j<jmax; j++) {
2518         row = irow_i[j];
2519         proc = row2proc_i[j];
2520         if (proc == rank) {
2521           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2522           if (!allcolumns[i]) {
2523             for (k=0; k<ncols; k++) {
2524 #if defined(PETSC_USE_CTABLE)
2525               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2526 #else
2527               tcol = cmap_i[cols[k]];
2528 #endif
2529               if (tcol) lens_i[j]++;
2530             }
2531           } else { /* allcolumns */
2532             lens_i[j] = ncols;
2533           }
2534           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2535         }
2536       }
2537     }
2538 
2539     /* Create row map: global row of C -> local row of submatrices */
2540 #if defined(PETSC_USE_CTABLE)
2541     for (i=0; i<ismax; i++) {
2542       ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2543       irow_i = irow[i];
2544       jmax   = nrow[i];
2545       for (j=0; j<jmax; j++) {
2546       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2547       }
2548     }
2549 #else
2550     for (i=0; i<ismax; i++) {
2551       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2552       rmap_i = rmap[i];
2553       irow_i = irow[i];
2554       jmax   = nrow[i];
2555       for (j=0; j<jmax; j++) {
2556         rmap_i[irow_i[j]] = j;
2557       }
2558     }
2559 #endif
2560 
2561     /* Update lens from offproc data */
2562     {
2563       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2564 
2565       ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
2566       for (tmp2=0; tmp2<nrqs; tmp2++) {
2567         sbuf1_i = sbuf1[pa[tmp2]];
2568         jmax    = sbuf1_i[0];
2569         ct1     = 2*jmax+1;
2570         ct2     = 0;
2571         rbuf2_i = rbuf2[tmp2];
2572         rbuf3_i = rbuf3[tmp2];
2573         for (j=1; j<=jmax; j++) {
2574           is_no  = sbuf1_i[2*j-1];
2575           max1   = sbuf1_i[2*j];
2576           lens_i = lens[is_no];
2577           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2578           rmap_i = rmap[is_no];
2579           for (k=0; k<max1; k++,ct1++) {
2580 #if defined(PETSC_USE_CTABLE)
2581             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2582             row--;
2583             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2584 #else
2585             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2586 #endif
2587             max2 = rbuf2_i[ct1];
2588             for (l=0; l<max2; l++,ct2++) {
2589               if (!allcolumns[is_no]) {
2590 #if defined(PETSC_USE_CTABLE)
2591                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2592 #else
2593                 tcol = cmap_i[rbuf3_i[ct2]];
2594 #endif
2595                 if (tcol) lens_i[row]++;
2596               } else { /* allcolumns */
2597                 lens_i[row]++; /* lens_i[row] += max2 ? */
2598               }
2599             }
2600           }
2601         }
2602       }
2603     }
2604     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2605     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2606     ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
2607     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2608 
2609     /* Create the submatrices */
2610     for (i=0; i<ismax; i++) {
2611       PetscInt    rbs,cbs;
2612 
2613       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2614       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2615 
2616       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2617       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2618 
2619       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2620       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2621       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2622 
2623       /* create struct Mat_SubMat and attached it to submat */
2624       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2625       subc = (Mat_SeqAIJ*)submats[i]->data;
2626       subc->submatis1 = smat_i;
2627       smats[i]        = smat_i;
2628 
2629       smat_i->destroy          = submats[i]->ops->destroy;
2630       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2631       submats[i]->factortype   = C->factortype;
2632 
2633       smat_i->id          = i;
2634       smat_i->nrqs        = nrqs;
2635       smat_i->nrqr        = nrqr;
2636       smat_i->rbuf1       = rbuf1;
2637       smat_i->rbuf2       = rbuf2;
2638       smat_i->rbuf3       = rbuf3;
2639       smat_i->sbuf2       = sbuf2;
2640       smat_i->req_source2 = req_source2;
2641 
2642       smat_i->sbuf1       = sbuf1;
2643       smat_i->ptr         = ptr;
2644       smat_i->tmp         = tmp;
2645       smat_i->ctr         = ctr;
2646 
2647       smat_i->pa           = pa;
2648       smat_i->req_size     = req_size;
2649       smat_i->req_source1  = req_source1;
2650 
2651       smat_i->allcolumns  = allcolumns[i];
2652       smat_i->singleis    = PETSC_FALSE;
2653       smat_i->row2proc    = row2proc[i];
2654       smat_i->rmap        = rmap[i];
2655       smat_i->cmap        = cmap[i];
2656     }
2657 
2658     if (!ismax) { /* Create an empty submats[0] for reuse struct subc */
2659       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2660       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2661       ierr = MatSetType(submats[0],((PetscObject)A)->type_name);CHKERRQ(ierr);
2662       ierr = MatSeqAIJSetPreallocation(submats[0],0,NULL);CHKERRQ(ierr);
2663 
2664       i = 0;
2665       /* create struct Mat_SubMat and attached it to submat */
2666       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2667       subc = (Mat_SeqAIJ*)submats[i]->data;
2668       subc->submatis1 = smat_i;
2669 
2670       smat_i->destroy          = submats[i]->ops->destroy;
2671       submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
2672       submats[i]->factortype   = C->factortype;
2673 
2674       smat_i->id          = i;
2675       smat_i->nrqs        = nrqs;
2676       smat_i->nrqr        = nrqr;
2677       smat_i->rbuf1       = rbuf1;
2678       smat_i->rbuf2       = rbuf2;
2679       smat_i->rbuf3       = rbuf3;
2680       smat_i->sbuf2       = sbuf2;
2681       smat_i->req_source2 = req_source2;
2682 
2683       smat_i->sbuf1       = sbuf1;
2684       smat_i->ptr         = ptr;
2685       smat_i->tmp         = tmp;
2686       smat_i->ctr         = ctr;
2687 
2688       smat_i->pa           = pa;
2689       smat_i->req_size     = req_size;
2690       smat_i->req_source1  = req_source1;
2691 
2692       smat_i->allcolumns  = PETSC_TRUE;
2693       smat_i->singleis    = PETSC_FALSE;
2694       smat_i->row2proc    = NULL;
2695       smat_i->rmap        = NULL;
2696       smat_i->cmap        = NULL;
2697     }
2698 
2699     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2700     ierr = PetscFree(lens);CHKERRQ(ierr);
2701     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2702     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2703 
2704   } /* endof scall == MAT_INITIAL_MATRIX */
2705 
2706   /* Post recv matrix values */
2707   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2708   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2709   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2710   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2711   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2712   for (i=0; i<nrqs; ++i) {
2713     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
2714     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
2715   }
2716 
2717   /* Allocate sending buffers for a->a, and send them off */
2718   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2719   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2720   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2721   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2722 
2723   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2724   {
2725     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2726     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2727     PetscInt    cend   = C->cmap->rend;
2728     PetscInt    *b_j   = b->j;
2729     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2730 
2731     for (i=0; i<nrqr; i++) {
2732       rbuf1_i   = rbuf1[i];
2733       sbuf_aa_i = sbuf_aa[i];
2734       ct1       = 2*rbuf1_i[0]+1;
2735       ct2       = 0;
2736       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2737         kmax = rbuf1_i[2*j];
2738         for (k=0; k<kmax; k++,ct1++) {
2739           row    = rbuf1_i[ct1] - rstart;
2740           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2741           ncols  = nzA + nzB;
2742           cworkB = b_j + b_i[row];
2743           vworkA = a_a + a_i[row];
2744           vworkB = b_a + b_i[row];
2745 
2746           /* load the column values for this row into vals*/
2747           vals = sbuf_aa_i+ct2;
2748 
2749           lwrite = 0;
2750           for (l=0; l<nzB; l++) {
2751             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2752           }
2753           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2754           for (l=0; l<nzB; l++) {
2755             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2756           }
2757 
2758           ct2 += ncols;
2759         }
2760       }
2761       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
2762     }
2763   }
2764 
2765   /* Assemble the matrices */
2766   /* First assemble the local rows */
2767   for (i=0; i<ismax; i++) {
2768     row2proc_i = row2proc[i];
2769     subc      = (Mat_SeqAIJ*)submats[i]->data;
2770     imat_ilen = subc->ilen;
2771     imat_j    = subc->j;
2772     imat_i    = subc->i;
2773     imat_a    = subc->a;
2774 
2775     if (!allcolumns[i]) cmap_i = cmap[i];
2776     rmap_i = rmap[i];
2777     irow_i = irow[i];
2778     jmax   = nrow[i];
2779     for (j=0; j<jmax; j++) {
2780       row  = irow_i[j];
2781       proc = row2proc_i[j];
2782       if (proc == rank) {
2783         old_row = row;
2784 #if defined(PETSC_USE_CTABLE)
2785         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2786         row--;
2787 #else
2788         row = rmap_i[row];
2789 #endif
2790         ilen_row = imat_ilen[row];
2791         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2792         mat_i    = imat_i[row];
2793         mat_a    = imat_a + mat_i;
2794         mat_j    = imat_j + mat_i;
2795         if (!allcolumns[i]) {
2796           for (k=0; k<ncols; k++) {
2797 #if defined(PETSC_USE_CTABLE)
2798             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2799 #else
2800             tcol = cmap_i[cols[k]];
2801 #endif
2802             if (tcol) {
2803               *mat_j++ = tcol - 1;
2804               *mat_a++ = vals[k];
2805               ilen_row++;
2806             }
2807           }
2808         } else { /* allcolumns */
2809           for (k=0; k<ncols; k++) {
2810             *mat_j++ = cols[k];  /* global col index! */
2811             *mat_a++ = vals[k];
2812             ilen_row++;
2813           }
2814         }
2815         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2816 
2817         imat_ilen[row] = ilen_row;
2818       }
2819     }
2820   }
2821 
2822   /* Now assemble the off proc rows */
2823   ierr    = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
2824   for (tmp2=0; tmp2<nrqs; tmp2++) {
2825     sbuf1_i = sbuf1[pa[tmp2]];
2826     jmax    = sbuf1_i[0];
2827     ct1     = 2*jmax + 1;
2828     ct2     = 0;
2829     rbuf2_i = rbuf2[tmp2];
2830     rbuf3_i = rbuf3[tmp2];
2831     rbuf4_i = rbuf4[tmp2];
2832     for (j=1; j<=jmax; j++) {
2833       is_no     = sbuf1_i[2*j-1];
2834       rmap_i    = rmap[is_no];
2835       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2836       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2837       imat_ilen = subc->ilen;
2838       imat_j    = subc->j;
2839       imat_i    = subc->i;
2840       imat_a    = subc->a;
2841       max1      = sbuf1_i[2*j];
2842       for (k=0; k<max1; k++,ct1++) {
2843         row = sbuf1_i[ct1];
2844 #if defined(PETSC_USE_CTABLE)
2845         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2846         row--;
2847 #else
2848         row = rmap_i[row];
2849 #endif
2850         ilen  = imat_ilen[row];
2851         mat_i = imat_i[row];
2852         mat_a = imat_a + mat_i;
2853         mat_j = imat_j + mat_i;
2854         max2  = rbuf2_i[ct1];
2855         if (!allcolumns[is_no]) {
2856           for (l=0; l<max2; l++,ct2++) {
2857 #if defined(PETSC_USE_CTABLE)
2858             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2859 #else
2860             tcol = cmap_i[rbuf3_i[ct2]];
2861 #endif
2862             if (tcol) {
2863               *mat_j++ = tcol - 1;
2864               *mat_a++ = rbuf4_i[ct2];
2865               ilen++;
2866             }
2867           }
2868         } else { /* allcolumns */
2869           for (l=0; l<max2; l++,ct2++) {
2870             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2871             *mat_a++ = rbuf4_i[ct2];
2872             ilen++;
2873           }
2874         }
2875         imat_ilen[row] = ilen;
2876       }
2877     }
2878   }
2879 
2880   if (!iscsorted) { /* sort column indices of the rows */
2881     for (i=0; i<ismax; i++) {
2882       subc       = (Mat_SeqAIJ*)submats[i]->data;
2883       imat_j    = subc->j;
2884       imat_i    = subc->i;
2885       imat_a    = subc->a;
2886       imat_ilen = subc->ilen;
2887 
2888       if (allcolumns[i]) continue;
2889       jmax = nrow[i];
2890       for (j=0; j<jmax; j++) {
2891         PetscInt ilen;
2892 
2893         mat_i = imat_i[j];
2894         mat_a = imat_a + mat_i;
2895         mat_j = imat_j + mat_i;
2896         ilen  = imat_ilen[j];
2897         ierr  = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2898       }
2899     }
2900   }
2901 
2902   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2903   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2904   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2905   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2906   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2907 
2908   /* Restore the indices */
2909   for (i=0; i<ismax; i++) {
2910     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2911     if (!allcolumns[i]) {
2912       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2913     }
2914   }
2915 
2916   for (i=0; i<ismax; i++) {
2917     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2918     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2919   }
2920   if (!ismax) { /* an empty matrix */
2921     ierr = MatAssemblyBegin(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2922     ierr = MatAssemblyEnd(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2923   }
2924 
2925   /* Destroy allocated memory */
2926   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2927   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2928   ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2929 
2930   for (i=0; i<nrqs; ++i) {
2931     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2932   }
2933   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2934 
2935   ierr = PetscFree5(smats,row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2936   PetscFunctionReturn(0);
2937 }
2938 
2939 /*
2940  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2941  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2942  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2943  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2944  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2945  state, and needs to be "assembled" later by compressing B's column space.
2946 
2947  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2948  Following this call, C->A & C->B have been created, even if empty.
2949  */
2950 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2951 {
2952   /* If making this function public, change the error returned in this function away from _PLIB. */
2953   PetscErrorCode ierr;
2954   Mat_MPIAIJ     *aij;
2955   Mat_SeqAIJ     *Baij;
2956   PetscBool      seqaij,Bdisassembled;
2957   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2958   PetscScalar    v;
2959   const PetscInt *rowindices,*colindices;
2960 
2961   PetscFunctionBegin;
2962   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2963   if (A) {
2964     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2965     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2966     if (rowemb) {
2967       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2968       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2969     } else {
2970       if (C->rmap->n != A->rmap->n) {
2971 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2972       }
2973     }
2974     if (dcolemb) {
2975       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2976       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2977     } else {
2978       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2979     }
2980   }
2981   if (B) {
2982     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2983     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2984     if (rowemb) {
2985       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2986       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2987     } else {
2988       if (C->rmap->n != B->rmap->n) {
2989 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2990       }
2991     }
2992     if (ocolemb) {
2993       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2994       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2995     } else {
2996       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2997     }
2998   }
2999 
3000   aij    = (Mat_MPIAIJ*)(C->data);
3001   if (!aij->A) {
3002     /* Mimic parts of MatMPIAIJSetPreallocation() */
3003     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
3004     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
3005     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
3006     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
3007     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
3008   }
3009   if (A) {
3010     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
3011   } else {
3012     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
3013   }
3014   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
3015     /*
3016       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
3017       need to "disassemble" B -- convert it to using C's global indices.
3018       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
3019 
3020       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
3021       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
3022 
3023       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
3024       At least avoid calling MatSetValues() and the implied searches?
3025     */
3026 
3027     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
3028 #if defined(PETSC_USE_CTABLE)
3029       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
3030 #else
3031       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
3032       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
3033       if (aij->B) {
3034         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
3035       }
3036 #endif
3037       ngcol = 0;
3038       if (aij->lvec) {
3039 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
3040       }
3041       if (aij->garray) {
3042 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
3043 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
3044       }
3045       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
3046       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
3047     }
3048     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3049       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
3050     }
3051     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3052       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
3053     }
3054   }
3055   Bdisassembled = PETSC_FALSE;
3056   if (!aij->B) {
3057     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3058     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3059     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3060     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3061     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3062     Bdisassembled = PETSC_TRUE;
3063   }
3064   if (B) {
3065     Baij = (Mat_SeqAIJ*)(B->data);
3066     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3067       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3068       for (i=0; i<B->rmap->n; i++) {
3069 	nz[i] = Baij->i[i+1] - Baij->i[i];
3070       }
3071       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3072       ierr = PetscFree(nz);CHKERRQ(ierr);
3073     }
3074 
3075     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3076     shift = rend-rstart;
3077     count = 0;
3078     rowindices = NULL;
3079     colindices = NULL;
3080     if (rowemb) {
3081       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3082     }
3083     if (ocolemb) {
3084       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3085     }
3086     for (i=0; i<B->rmap->n; i++) {
3087       PetscInt row;
3088       row = i;
3089       if (rowindices) row = rowindices[i];
3090       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3091 	col  = Baij->j[count];
3092 	if (colindices) col = colindices[col];
3093 	if (Bdisassembled && col>=rstart) col += shift;
3094 	v    = Baij->a[count];
3095 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3096 	++count;
3097       }
3098     }
3099     /* No assembly for aij->B is necessary. */
3100     /* FIXME: set aij->B's nonzerostate correctly. */
3101   } else {
3102     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3103   }
3104   C->preallocated  = PETSC_TRUE;
3105   C->was_assembled = PETSC_FALSE;
3106   C->assembled     = PETSC_FALSE;
3107    /*
3108       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3109       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3110    */
3111   PetscFunctionReturn(0);
3112 }
3113 
3114 /*
3115   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3116  */
3117 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3118 {
3119   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3120 
3121   PetscFunctionBegin;
3122   PetscValidPointer(A,2);
3123   PetscValidPointer(B,3);
3124   /* FIXME: make sure C is assembled */
3125   *A = aij->A;
3126   *B = aij->B;
3127   /* Note that we don't incref *A and *B, so be careful! */
3128   PetscFunctionReturn(0);
3129 }
3130 
3131 /*
3132   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3133   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3134 */
3135 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3136                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3137 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3138 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3139 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3140 {
3141   PetscErrorCode ierr;
3142   PetscMPIInt    isize,flag;
3143   PetscInt       i,ii,cismax,ispar;
3144   Mat            *A,*B;
3145   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3146 
3147   PetscFunctionBegin;
3148   if (!ismax) PetscFunctionReturn(0);
3149 
3150   for (i = 0, cismax = 0; i < ismax; ++i) {
3151     PetscMPIInt isize;
3152     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3153     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3154     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3155     if (isize > 1) ++cismax;
3156   }
3157 
3158   /*
3159      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3160      ispar counts the number of parallel ISs across C's comm.
3161   */
3162   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3163   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3164     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3165     PetscFunctionReturn(0);
3166   }
3167 
3168   /* if (ispar) */
3169   /*
3170     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3171     These are used to extract the off-diag portion of the resulting parallel matrix.
3172     The row IS for the off-diag portion is the same as for the diag portion,
3173     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3174   */
3175   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3176   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3177   for (i = 0, ii = 0; i < ismax; ++i) {
3178     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3179     if (isize > 1) {
3180       /*
3181 	 TODO: This is the part that's ***NOT SCALABLE***.
3182 	 To fix this we need to extract just the indices of C's nonzero columns
3183 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3184 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3185 	 to be done without serializing on the IS list, so, most likely, it is best
3186 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3187       */
3188       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3189       /* Now we have to
3190 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3191 	     were sorted on each rank, concatenated they might no longer be sorted;
3192 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3193 	     indices in the nondecreasing order to the original index positions.
3194 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3195       */
3196       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3197       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3198       ++ii;
3199     }
3200   }
3201   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3202   for (i = 0, ii = 0; i < ismax; ++i) {
3203     PetscInt       j,issize;
3204     const PetscInt *indices;
3205 
3206     /*
3207        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3208      */
3209     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3210     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3211     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3212     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3213     for (j = 1; j < issize; ++j) {
3214       if (indices[j] == indices[j-1]) {
3215 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3216       }
3217     }
3218     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3219 
3220 
3221     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3222     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3223     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3224     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3225     for (j = 1; j < issize; ++j) {
3226       if (indices[j-1] == indices[j]) {
3227 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3228       }
3229     }
3230     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3231     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3232     if (isize > 1) {
3233       cisrow[ii] = isrow[i];
3234       ++ii;
3235     }
3236   }
3237   /*
3238     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3239     array of sequential matrices underlying the resulting parallel matrices.
3240     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3241     contain duplicates.
3242 
3243     There are as many diag matrices as there are original index sets. There are only as many parallel
3244     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3245 
3246     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3247     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3248       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3249       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3250       with A[i] and B[ii] extracted from the corresponding MPI submat.
3251     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3252       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3253       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3254       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3255       values into A[i] and B[ii] sitting inside the corresponding submat.
3256     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3257       A[i], B[ii], AA[i] or BB[ii] matrices.
3258   */
3259   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3260   if (scall == MAT_INITIAL_MATRIX) {
3261     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3262   }
3263 
3264   /* Now obtain the sequential A and B submatrices separately. */
3265   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3266   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3267   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3268 
3269   /*
3270     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3271     matrices A & B have been extracted directly into the parallel matrices containing them, or
3272     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3273     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3274     to have the same sparsity pattern.
3275     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3276     must be constructed for C. This is done by setseqmat(s).
3277   */
3278   for (i = 0, ii = 0; i < ismax; ++i) {
3279     /*
3280        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3281        That way we can avoid sorting and computing permutations when reusing.
3282        To this end:
3283         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3284 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3285 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3286     */
3287     MatStructure pattern;
3288     pattern = DIFFERENT_NONZERO_PATTERN;
3289 
3290     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3291     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3292     if (isize > 1) {
3293       if (scall == MAT_INITIAL_MATRIX) {
3294 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3295 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3296 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3297 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3298 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3299       }
3300       /*
3301 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3302       */
3303       {
3304 	Mat AA,BB;
3305         AA = A[i];
3306         BB = B[ii];
3307 	if (AA || BB) {
3308 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3309 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3310 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3311 	}
3312 
3313         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3314         ierr = MatDestroy(&BB);CHKERRQ(ierr);
3315       }
3316       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3317       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3318       ++ii;
3319     } else { /* if (isize == 1) */
3320       if (scall == MAT_REUSE_MATRIX) {
3321         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3322       }
3323       if (isrow_p[i] || iscol_p[i]) {
3324         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3325         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3326 	/* Otherwise A is extracted straight into (*submats)[i]. */
3327 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3328 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3329       } else (*submat)[i] = A[i];
3330     }
3331     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3332     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3333   }
3334   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3335   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3336   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3337   ierr = PetscFree(A);CHKERRQ(ierr);
3338   ierr = PetscFree(B);CHKERRQ(ierr);
3339   PetscFunctionReturn(0);
3340 }
3341 
3342 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3343 {
3344   PetscErrorCode ierr;
3345 
3346   PetscFunctionBegin;
3347   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3348   PetscFunctionReturn(0);
3349 }
3350