xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision fbfcfee5110779e3d6a9465ca0a2e0f9a1a6e5e3)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23 {
24   PetscErrorCode ierr;
25   PetscInt       i;
26 
27   PetscFunctionBegin;
28   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29   for (i=0; i<ov; ++i) {
30     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
31   }
32   PetscFunctionReturn(0);
33 }
34 
35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36 {
37   PetscErrorCode ierr;
38   PetscInt       i;
39 
40   PetscFunctionBegin;
41   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42   for (i=0; i<ov; ++i) {
43     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
44   }
45   PetscFunctionReturn(0);
46 }
47 
48 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - adds one layer of overlap to the index
   sets is[], using PetscSF-based communication whose memory use scales with the
   number of neighbouring processes rather than with the communicator size.

   Phases:
     1. Gather all IS entries, split them into locally-owned vs remote rows.
     2. If no process has remote rows, only local work is needed (typical for
        the first overlap layer) - do it and return early.
     3. Otherwise ship <is id, row> pairs to the owning processes, let them
        compute the overlap rows (Send_Scalable), receive the replies while
        overlapping with the local computation, and merge (Receive_Scalable).

   Input/Output:
.  mat  - the parallel MPIAIJ matrix
.  nidx - number of index sets
.  is   - the index sets; each is replaced in place by its enlarged version
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* each received entry points back at slot fromsizes[2*i+1]+j of the sender's buffer */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /* pack <is id, row> pairs grouped by destination rank, using toffsets as write cursors */
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second round: exchange reply sizes, then the computed overlap rows themselves */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
191 
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merges overlap rows received from
   other processes into the local index sets.

   recvdata is a stream of nrecvs integers made of triples
   <is id, number of rows, rows...>. Each IS in is[] is destroyed and recreated
   with the received rows merged in, sorted, and deduplicated.

   Input/Output:
.  mat      - the matrix (unused here beyond the call signature)
.  nidx     - number of index sets
.  is       - the index sets, replaced in place
.  nrecvs   - total length of recvdata
.  recvdata - packed reply data described above
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  /* isz[i] tracks the current fill level of index set i */
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of (max_lsize+nrecvs) entries per IS: enough for its current indices
     plus, in the worst case, every received entry landing in that one IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for each <is id, row> pair received
   from a neighbour, computes the column indices (in global numbering) of that
   locally-owned row from both the diagonal (A) and off-diagonal (B) blocks, and
   packs the reply buffers to be sent back.

   The reply for each requesting rank is a sequence of triples
   <is id, number of rows, rows...>; sbsizes stores per-rank <size, offset> pairs.

   Two passes over the same request data: the first pass only counts unique
   columns per (rank, is) so the reply buffer can be allocated exactly once;
   the second pass fills it.

   Output Parameters:
.  sbrowsizes - per-rank <size, offset> pairs (caller frees)
.  sbrows     - the packed reply data (caller frees)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count unique overlap columns per (rank, is) to size the reply buffer */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: same traversal, now filling sbdata with the triples */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs for PetscCommBuildTwoSided */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - for each index set, adds the column
   indices (global numbering) of every locally-owned row already in the set,
   using both the diagonal (A) and off-diagonal (B) blocks.

   Rows owned by other processes are skipped here; they are handled by the
   send/receive pair in the scalable path. Each IS is destroyed and recreated
   sorted and deduplicated.

   Input/Output:
.  mat  - the parallel MPIAIJ matrix
.  nidx - number of index sets
.  is   - the index sets, replaced in place
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  /* tnz bounds the scratch needed for any one IS because a local row's nonzeros
     are gathered at most once per IS */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
423 
424 
425 /*
426   Sample message format:
427   If a processor A wants processor B to process some elements corresponding
428   to index sets is[1],is[5]
429   mesg [0] = 2   (no of index sets in the mesg)
430   -----------
431   mesg [1] = 1 => is[1]
432   mesg [2] = sizeof(is[1]);
433   -----------
434   mesg [3] = 5  => is[5]
435   mesg [4] = sizeof(is[5]);
436   -----------
437   mesg [5]
  mesg [n]  data(is[1])
439   -----------
440   mesg[n+1]
441   mesg[m]  data(is[5])
442   -----------
443 
444   Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
447 */
/*
   MatIncreaseOverlap_MPIAIJ_Once - adds one layer of overlap to the index sets
   is[] using classic two-round point-to-point MPI (Isend/Irecv); see the message
   format described in the comment above.

   Round 1: each rank ships the rows it does not own (tagged by IS id) to their
   owners and meanwhile does the local overlap work. Round 2: owners compute the
   overlap rows (MatIncreaseOverlap_MPIAIJ_Receive) and send the results back,
   which are merged into the local tables. Each IS is destroyed and recreated.

   Per-IS membership is tracked with a bitmap (table[i]) plus either a dense
   array data[i] or, with PETSC_USE_CTABLE, a hash table table_data[i] keyed by
   row+1 (0 is not a valid PetscTable key).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = total rows destined for rank p; w3[p] = number of ISes contributing to p */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve tmp into per-destination slices; outdat[j] points at rank j's slice */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests: xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data starts after the <is id, count> header pairs */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated rows; the hash table stores position+1
     keyed by row+1, so both are decremented on extraction */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
796 
797 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
799        the work on the local processor.
800 
801      Inputs:
802       C      - MAT_MPIAIJ;
803       imax - total no of index sets processed at a time;
804       table  - an array of char - size = m bits.
805 
806      Output:
807       isz    - array containing the count of the solution elements corresponding
808                to each index set;
809       data or table_data  - pointer to the solutions
810 */
811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812 {
813   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814   Mat        A  = c->A,B = c->B;
815   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818   PetscBT    table_i;
819 #if defined(PETSC_USE_CTABLE)
820   PetscTable         table_data_i;
821   PetscErrorCode     ierr;
822   PetscTablePosition tpos;
823   PetscInt           tcount,*tdata;
824 #else
825   PetscInt           *data_i;
826 #endif
827 
828   PetscFunctionBegin;
829   rstart = C->rmap->rstart;
830   cstart = C->cmap->rstart;
831   ai     = a->i;
832   aj     = a->j;
833   bi     = b->i;
834   bj     = b->j;
835   garray = c->garray;
836 
837   for (i=0; i<imax; i++) {
838 #if defined(PETSC_USE_CTABLE)
839     /* copy existing entries of table_data_i into tdata[] */
840     table_data_i = table_data[i];
841     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
842     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
843 
844     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
845     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
846     while (tpos) {
847       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
848       tdata[--j] = --row;
849       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850     }
851 #else
852     data_i  = data[i];
853 #endif
854     table_i = table[i];
855     isz_i   = isz[i];
856     max     = isz[i];
857 
858     for (j=0; j<max; j++) {
859 #if defined(PETSC_USE_CTABLE)
860       row   = tdata[j] - rstart;
861 #else
862       row   = data_i[j] - rstart;
863 #endif
864       start = ai[row];
865       end   = ai[row+1];
866       for (k=start; k<end; k++) { /* Amat */
867         val = aj[k] + cstart;
868         if (!PetscBTLookupSet(table_i,val)) {
869 #if defined(PETSC_USE_CTABLE)
870           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
871 #else
872           data_i[isz_i] = val;
873 #endif
874           isz_i++;
875         }
876       }
877       start = bi[row];
878       end   = bi[row+1];
879       for (k=start; k<end; k++) { /* Bmat */
880         val = garray[bj[k]];
881         if (!PetscBTLookupSet(table_i,val)) {
882 #if defined(PETSC_USE_CTABLE)
883           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
884 #else
885           data_i[isz_i] = val;
886 #endif
887           isz_i++;
888         }
889       }
890     }
891     isz[i] = isz_i;
892 
893 #if defined(PETSC_USE_CTABLE)
894     ierr = PetscFree(tdata);CHKERRQ(ierr);
895 #endif
896   }
897   PetscFunctionReturn(0);
898 }
899 
900 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
902          and return the output
903 
904          Input:
905            C    - the matrix
906            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
908 
909          Output:
910            xdata - array of messages to be sent back
911            isz1  - size of each message
912 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
917 
918 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* First pass over the incoming requests: count the total number of ISs
     (ct) and the total number of requested row indices (total_sz), to size
     the single reply buffer xdata[0] that backs all xdata[i] slices */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];              /* header word: number of ISs in this message */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic estimate: average nonzeros per local row times the number of ISs */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);   /* dedup table over all global rows */
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts total words written into xdata[0] across all messages and
     triggers the grow-and-rebase reallocation when it reaches mem_estimate */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;            /* read cursor past the header of rbuf_i */
    ct2    =  ct1;                   /* write cursor into xdata[i] */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);   /* fresh dedup set per IS */
      oct2 = ct2;                    /* remember where this IS's rows begin */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow xdata[0] by 1.5x and rebase every xdata[ctr] slice pointer */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second stage: for every requested row just stored, append its column
         indices (overlap expansion), still deduplicated through xtable */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: garray maps to global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: entry count for this IS and the echoed IS number */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;    /* next message's slice starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1035 /* -------------------------------------------------------------------------*/
1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1038 /*
1039     Every processor gets the entire matrix
1040 */
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* row length = nnz in diagonal block + nnz in off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    /* no MPI_IN_PLACE support: gather our slice of lens[] explicitly */
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at the offset of this rank's first row */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row so that global column indices come out sorted:
       below-diagonal B entries, then A entries, then above-diagonal B entries */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are re-used here in units of nonzeros, not rows */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices: prefix sums of the row lengths */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built, just recover B */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index copy above,
       so values line up with the already-gathered column indices */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed only on the INITIAL path's end; allocate
       them here on the REUSE path where they are still NULL */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/* Destructor for submatrices produced by the MatGetSubMatrices machinery:
   first runs the type's original destroy, then releases the cached
   communication state (Mat_SubMat) kept for MAT_REUSE_MATRIX calls. */
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  Mat_SubMat     *submatj = c->submatis1;
  PetscInt       i;

  PetscFunctionBegin;
  /* chain to the destroy routine saved when the submatrix was created */
  ierr = submatj->destroy(C);CHKERRQ(ierr);

  ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);

  /* sbuf2[i] were allocated one per received request */
  for (i=0; i<submatj->nrqr; ++i) {
    ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);

  /* rbuf1 is a single slab: rbuf1[0] owns the storage for all slices */
  ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);

  /* rbuf3[i] were allocated one per outgoing request */
  for (i=0; i<submatj->nrqs; ++i) {
    ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(submatj->pa);CHKERRQ(ierr);

  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

  /* the row map is a PetscTable or a dense array depending on the build */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
  if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
  ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
#else
  ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
#endif

  /* no column map was built in the allcolumns case */
  if (!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1270 
1271 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1272 {
1273   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1274   Mat            submat,A = c->A,B = c->B;
1275   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1276   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1277   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1278   const PetscInt *icol,*irow;
1279   PetscInt       nrow,ncol,start;
1280   PetscErrorCode ierr;
1281   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1282   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1283   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1284   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1285   PetscInt       *lens,rmax,ncols,*cols,Crow;
1286 #if defined(PETSC_USE_CTABLE)
1287   PetscTable     cmap,rmap;
1288   PetscInt       *cmap_loc,*rmap_loc;
1289 #else
1290   PetscInt       *cmap,*rmap;
1291 #endif
1292   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1293   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1294   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1295   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1296   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1297   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1298   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1299   MPI_Comm       comm;
1300   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1301   PetscMPIInt    *onodes1,*olengths1,idex,end;
1302   Mat_SubMat     *smatis1;
1303   PetscBool      isrowsorted;
1304 
1305   PetscFunctionBegin;
1306   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1307 
1308   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1309   size = c->size;
1310   rank = c->rank;
1311 
1312   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1313   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1314 
1315   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1316   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1317   if (allcolumns) {
1318     icol = NULL;
1319     ncol = C->cmap->N;
1320   } else {
1321     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1322     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1323   }
1324 
1325   if (scall == MAT_INITIAL_MATRIX) {
1326     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1327 
1328     /* Get some new tags to keep the communication clean */
1329     tag1 = ((PetscObject)C)->tag;
1330     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1331     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1332 
1333     /* evaluate communication - mesg to who, length of mesg, and buffer space
1334      required. Based on this, buffers are allocated, and data copied into them */
1335     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1336     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1337 
1338     /* w1[proc] = num of rows owned by proc -- to be requested */
1339     proc = 0;
1340     nrqs = 0; /* num of outgoing messages */
1341     for (j=0; j<nrow; j++) {
1342       row  = irow[j]; /* sorted! */
1343       while (row >= C->rmap->range[proc+1]) proc++;
1344       w1[proc]++;
1345       row2proc[j] = proc; /* map row index to proc */
1346 
1347       if (proc != rank && !w2[proc]) {
1348         w2[proc] = 1; nrqs++;
1349       }
1350     }
1351     w1[rank] = 0;  /* rows owned by self will not be requested */
1352 
1353     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1354     for (proc=0,j=0; proc<size; proc++) {
1355       if (w1[proc]) { pa[j++] = proc;}
1356     }
1357 
1358     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1359     msz = 0;              /* total mesg length (for all procs) */
1360     for (i=0; i<nrqs; i++) {
1361       proc      = pa[i];
1362       w1[proc] += 3;
1363       msz      += w1[proc];
1364     }
1365     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1366 
1367     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1368     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1369     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1370 
1371     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1372        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1373     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1374 
1375     /* Now post the Irecvs corresponding to these messages */
1376     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1377 
1378     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1379     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1380 
1381     /* Allocate Memory for outgoing messages */
1382     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1383     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1384     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1385 
1386     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1387     iptr = tmp;
1388     for (i=0; i<nrqs; i++) {
1389       proc        = pa[i];
1390       sbuf1[proc] = iptr;
1391       iptr       += w1[proc];
1392     }
1393 
1394     /* Form the outgoing messages */
1395     /* Initialize the header space */
1396     for (i=0; i<nrqs; i++) {
1397       proc      = pa[i];
1398       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1399       ptr[proc] = sbuf1[proc] + 3;
1400     }
1401 
1402     /* Parse the isrow and copy data into outbuf */
1403     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1404     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1405       proc = row2proc[j];
1406       if (proc != rank) { /* copy to the outgoing buf*/
1407         *ptr[proc] = irow[j];
1408         ctr[proc]++; ptr[proc]++;
1409       }
1410     }
1411 
1412     /* Update the headers for the current IS */
1413     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1414       if ((ctr_j = ctr[j])) {
1415         sbuf1_j        = sbuf1[j];
1416         k              = ++sbuf1_j[0];
1417         sbuf1_j[2*k]   = ctr_j;
1418         sbuf1_j[2*k-1] = 0;
1419       }
1420     }
1421 
1422     /* Now post the sends */
1423     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1424     for (i=0; i<nrqs; ++i) {
1425       proc = pa[i];
1426       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1427     }
1428 
1429     /* Post Receives to capture the buffer size */
1430     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1431     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1432 
1433     rbuf2[0] = tmp + msz;
1434     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1435 
1436     for (i=0; i<nrqs; ++i) {
1437       proc = pa[i];
1438       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1439     }
1440 
1441     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1442 
1443     /* Send to other procs the buf size they should allocate */
1444     /* Receive messages*/
1445     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1446     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1447 
1448     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1449     for (i=0; i<nrqr; ++i) {
1450       req_size[i] = 0;
1451       rbuf1_i        = rbuf1[i];
1452       start          = 2*rbuf1_i[0] + 1;
1453       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1454       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1455       sbuf2_i        = sbuf2[i];
1456       for (j=start; j<end; j++) {
1457         k            = rbuf1_i[j] - rstart;
1458         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1459         sbuf2_i[j]   = ncols;
1460         req_size[i] += ncols;
1461       }
1462       req_source1[i] = r_status1[i].MPI_SOURCE;
1463 
1464       /* form the header */
1465       sbuf2_i[0] = req_size[i];
1466       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1467 
1468       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1469     }
1470 
1471     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1472     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1473 
1474     /* rbuf2 is received, Post recv column indices a->j */
1475     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1476 
1477     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1478     for (i=0; i<nrqs; ++i) {
1479       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1480       req_source2[i] = r_status2[i].MPI_SOURCE;
1481       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1482     }
1483 
1484     /* Wait on sends1 and sends2 */
1485     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1486     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1487     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1488     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1489 
1490     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1491     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1492 
1493     /* Now allocate sending buffers for a->j, and send them off */
1494     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1495     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1496     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1497     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1498 
1499     for (i=0; i<nrqr; i++) { /* for each requested message */
1500       rbuf1_i   = rbuf1[i];
1501       sbuf_aj_i = sbuf_aj[i];
1502       ct1       = 2*rbuf1_i[0] + 1;
1503       ct2       = 0;
1504       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1505 
1506       kmax = rbuf1[i][2];
1507       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1508         row    = rbuf1_i[ct1] - rstart;
1509         nzA    = ai[row+1] - ai[row];
1510         nzB    = bi[row+1] - bi[row];
1511         ncols  = nzA + nzB;
1512         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1513 
1514         /* load the column indices for this row into cols*/
1515         cols = sbuf_aj_i + ct2;
1516 
1517         lwrite = 0;
1518         for (l=0; l<nzB; l++) {
1519           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1520         }
1521         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1522         for (l=0; l<nzB; l++) {
1523           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1524         }
1525 
1526         ct2 += ncols;
1527       }
1528       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1529     }
1530 
1531     /* create column map (cmap): global col of C -> local col of submat */
1532 #if defined(PETSC_USE_CTABLE)
1533     if (!allcolumns) {
1534       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1535       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1536       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1537         if (icol[j] >= cstart && icol[j] <cend) {
1538           cmap_loc[icol[j] - cstart] = j+1;
1539         } else { /* use PetscTable for non-local col indices */
1540           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1541         }
1542       }
1543     } else {
1544       cmap     = NULL;
1545       cmap_loc = NULL;
1546     }
1547     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1548 #else
1549     if (!allcolumns) {
1550       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1551       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1552     } else {
1553       cmap = NULL;
1554     }
1555 #endif
1556 
1557     /* Create lens for MatSeqAIJSetPreallocation() */
1558     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1559 
1560     /* Compute lens from local part of C */
1561     for (j=0; j<nrow; j++) {
1562       row  = irow[j];
1563       proc = row2proc[j];
1564       if (proc == rank) {
1565         /* diagonal part A = c->A */
1566         ncols = ai[row-rstart+1] - ai[row-rstart];
1567         cols  = aj + ai[row-rstart];
1568         if (!allcolumns) {
1569           for (k=0; k<ncols; k++) {
1570 #if defined(PETSC_USE_CTABLE)
1571             tcol = cmap_loc[cols[k]];
1572 #else
1573             tcol = cmap[cols[k]+cstart];
1574 #endif
1575             if (tcol) lens[j]++;
1576           }
1577         } else { /* allcolumns */
1578           lens[j] = ncols;
1579         }
1580 
1581         /* off-diagonal part B = c->B */
1582         ncols = bi[row-rstart+1] - bi[row-rstart];
1583         cols  = bj + bi[row-rstart];
1584         if (!allcolumns) {
1585           for (k=0; k<ncols; k++) {
1586 #if defined(PETSC_USE_CTABLE)
1587             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1588 #else
1589             tcol = cmap[bmap[cols[k]]];
1590 #endif
1591             if (tcol) lens[j]++;
1592           }
1593         } else { /* allcolumns */
1594           lens[j] += ncols;
1595         }
1596       }
1597     }
1598 
1599     /* Create row map (rmap): global row of C -> local row of submat */
1600 #if defined(PETSC_USE_CTABLE)
1601     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1602     for (j=0; j<nrow; j++) {
1603       row  = irow[j];
1604       proc = row2proc[j];
1605       if (proc == rank) { /* a local row */
1606         rmap_loc[row - rstart] = j;
1607       } else {
1608         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1609       }
1610     }
1611 #else
1612     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1613     for (j=0; j<nrow; j++) {
1614       rmap[irow[j]] = j;
1615     }
1616 #endif
1617 
1618     /* Update lens from offproc data */
1619     /* recv a->j is done */
1620     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1621     for (i=0; i<nrqs; i++) {
1622       proc    = pa[i];
1623       sbuf1_i = sbuf1[proc];
1624       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1625       ct1     = 2 + 1;
1626       ct2     = 0;
1627       rbuf2_i = rbuf2[i]; /* received length of C->j */
1628       rbuf3_i = rbuf3[i]; /* received C->j */
1629 
1630       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1631       max1   = sbuf1_i[2];
1632       for (k=0; k<max1; k++,ct1++) {
1633 #if defined(PETSC_USE_CTABLE)
1634         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1635         row--;
1636         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1637 #else
1638         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1639 #endif
1640         /* Now, store row index of submat in sbuf1_i[ct1] */
1641         sbuf1_i[ct1] = row;
1642 
1643         nnz = rbuf2_i[ct1];
1644         if (!allcolumns) {
1645           for (l=0; l<nnz; l++,ct2++) {
1646 #if defined(PETSC_USE_CTABLE)
1647             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1648               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1649             } else {
1650               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1651             }
1652 #else
1653             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1654 #endif
1655             if (tcol) lens[row]++;
1656           }
1657         } else { /* allcolumns */
1658           lens[row] += nnz;
1659         }
1660       }
1661     }
1662     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1663     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1664 
1665     /* Create the submatrices */
1666     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1667     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1668 
1669     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1670     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1671     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1672     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1673     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1674 
1675     /* create struct Mat_SubMat and attached it to submat */
1676     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1677     subc = (Mat_SeqAIJ*)submat->data;
1678     subc->submatis1 = smatis1;
1679 
1680     smatis1->nrqs        = nrqs;
1681     smatis1->nrqr        = nrqr;
1682     smatis1->rbuf1       = rbuf1;
1683     smatis1->rbuf2       = rbuf2;
1684     smatis1->rbuf3       = rbuf3;
1685     smatis1->sbuf2       = sbuf2;
1686     smatis1->req_source2 = req_source2;
1687 
1688     smatis1->sbuf1       = sbuf1;
1689     smatis1->ptr         = ptr;
1690     smatis1->tmp         = tmp;
1691     smatis1->ctr         = ctr;
1692 
1693     smatis1->pa           = pa;
1694     smatis1->req_size     = req_size;
1695     smatis1->req_source1  = req_source1;
1696 
1697     smatis1->allcolumns  = allcolumns;
1698     smatis1->row2proc    = row2proc;
1699     smatis1->rmap        = rmap;
1700     smatis1->cmap        = cmap;
1701 #if defined(PETSC_USE_CTABLE)
1702     smatis1->rmap_loc    = rmap_loc;
1703     smatis1->cmap_loc    = cmap_loc;
1704 #endif
1705 
1706     smatis1->destroy     = submat->ops->destroy;
1707     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1708     submat->factortype   = C->factortype;
1709 
1710     /* compute rmax */
1711     rmax = 0;
1712     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1713 
1714   } else { /* scall == MAT_REUSE_MATRIX */
1715     submat = submats[0];
1716     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1717 
1718     subc    = (Mat_SeqAIJ*)submat->data;
1719     rmax    = subc->rmax;
1720     smatis1 = subc->submatis1;
1721     nrqs        = smatis1->nrqs;
1722     nrqr        = smatis1->nrqr;
1723     rbuf1       = smatis1->rbuf1;
1724     rbuf2       = smatis1->rbuf2;
1725     rbuf3       = smatis1->rbuf3;
1726     req_source2 = smatis1->req_source2;
1727 
1728     sbuf1     = smatis1->sbuf1;
1729     sbuf2     = smatis1->sbuf2;
1730     ptr       = smatis1->ptr;
1731     tmp       = smatis1->tmp;
1732     ctr       = smatis1->ctr;
1733 
1734     pa         = smatis1->pa;
1735     req_size   = smatis1->req_size;
1736     req_source1 = smatis1->req_source1;
1737 
1738     allcolumns = smatis1->allcolumns;
1739     row2proc   = smatis1->row2proc;
1740     rmap       = smatis1->rmap;
1741     cmap       = smatis1->cmap;
1742 #if defined(PETSC_USE_CTABLE)
1743     rmap_loc   = smatis1->rmap_loc;
1744     cmap_loc   = smatis1->cmap_loc;
1745 #endif
1746   }
1747 
1748   /* Post recv matrix values */
1749   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1750   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1751   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1752   for (i=0; i<nrqs; ++i) {
1753     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1754     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1755   }
1756 
1757   /* Allocate sending buffers for a->a, and send them off */
1758   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1759   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1760   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1761   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1762 
1763   for (i=0; i<nrqr; i++) {
1764     rbuf1_i   = rbuf1[i];
1765     sbuf_aa_i = sbuf_aa[i];
1766     ct1       = 2*rbuf1_i[0]+1;
1767     ct2       = 0;
1768     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1769 
1770     kmax = rbuf1_i[2];
1771     for (k=0; k<kmax; k++,ct1++) {
1772       row = rbuf1_i[ct1] - rstart;
1773       nzA = ai[row+1] - ai[row];
1774       nzB = bi[row+1] - bi[row];
1775       ncols  = nzA + nzB;
1776       cworkB = bj + bi[row];
1777       vworkA = a_a + ai[row];
1778       vworkB = b_a + bi[row];
1779 
1780       /* load the column values for this row into vals*/
1781       vals = sbuf_aa_i + ct2;
1782 
1783       lwrite = 0;
1784       for (l=0; l<nzB; l++) {
1785         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1786       }
1787       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1788       for (l=0; l<nzB; l++) {
1789         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1790       }
1791 
1792       ct2 += ncols;
1793     }
1794     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1795   }
1796 
1797   /* Assemble submat */
1798   /* First assemble the local rows */
1799   for (j=0; j<nrow; j++) {
1800     row  = irow[j];
1801     proc = row2proc[j];
1802     if (proc == rank) {
1803       Crow = row - rstart;  /* local row index of C */
1804 #if defined(PETSC_USE_CTABLE)
1805       row = rmap_loc[Crow]; /* row index of submat */
1806 #else
1807       row = rmap[row];
1808 #endif
1809 
1810       if (allcolumns) {
1811         /* diagonal part A = c->A */
1812         ncols = ai[Crow+1] - ai[Crow];
1813         cols  = aj + ai[Crow];
1814         vals  = a->a + ai[Crow];
1815         i     = 0;
1816         for (k=0; k<ncols; k++) {
1817           subcols[i]   = cols[k] + cstart;
1818           subvals[i++] = vals[k];
1819         }
1820 
1821         /* off-diagonal part B = c->B */
1822         ncols = bi[Crow+1] - bi[Crow];
1823         cols  = bj + bi[Crow];
1824         vals  = b->a + bi[Crow];
1825         for (k=0; k<ncols; k++) {
1826           subcols[i]   = bmap[cols[k]];
1827           subvals[i++] = vals[k];
1828         }
1829 
1830         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1831 
1832       } else { /* !allcolumns */
1833 #if defined(PETSC_USE_CTABLE)
1834         /* diagonal part A = c->A */
1835         ncols = ai[Crow+1] - ai[Crow];
1836         cols  = aj + ai[Crow];
1837         vals  = a->a + ai[Crow];
1838         i     = 0;
1839         for (k=0; k<ncols; k++) {
1840           tcol = cmap_loc[cols[k]];
1841           if (tcol) {
1842             subcols[i]   = --tcol;
1843             subvals[i++] = vals[k];
1844           }
1845         }
1846 
1847         /* off-diagonal part B = c->B */
1848         ncols = bi[Crow+1] - bi[Crow];
1849         cols  = bj + bi[Crow];
1850         vals  = b->a + bi[Crow];
1851         for (k=0; k<ncols; k++) {
1852           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1853           if (tcol) {
1854             subcols[i]   = --tcol;
1855             subvals[i++] = vals[k];
1856           }
1857         }
1858 #else
1859         /* diagonal part A = c->A */
1860         ncols = ai[Crow+1] - ai[Crow];
1861         cols  = aj + ai[Crow];
1862         vals  = a->a + ai[Crow];
1863         i     = 0;
1864         for (k=0; k<ncols; k++) {
1865           tcol = cmap[cols[k]+cstart];
1866           if (tcol) {
1867             subcols[i]   = --tcol;
1868             subvals[i++] = vals[k];
1869           }
1870         }
1871 
1872         /* off-diagonal part B = c->B */
1873         ncols = bi[Crow+1] - bi[Crow];
1874         cols  = bj + bi[Crow];
1875         vals  = b->a + bi[Crow];
1876         for (k=0; k<ncols; k++) {
1877           tcol = cmap[bmap[cols[k]]];
1878           if (tcol) {
1879             subcols[i]   = --tcol;
1880             subvals[i++] = vals[k];
1881           }
1882         }
1883 #endif
1884         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1885       }
1886     }
1887   }
1888 
1889   /* Now assemble the off-proc rows */
1890   for (i=0; i<nrqs; i++) { /* for each requested message */
1891     /* recv values from other processes */
1892     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1893     proc    = pa[idex];
1894     sbuf1_i = sbuf1[proc];
1895     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1896     ct1     = 2 + 1;
1897     ct2     = 0; /* count of received C->j */
1898     ct3     = 0; /* count of received C->j that will be inserted into submat */
1899     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1900     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1901     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1902 
1903     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1904     max1 = sbuf1_i[2];             /* num of rows */
1905     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1906       row = sbuf1_i[ct1]; /* row index of submat */
1907       if (!allcolumns) {
1908         idex = 0;
1909         if (scall == MAT_INITIAL_MATRIX) {
1910           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1911           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1912 #if defined(PETSC_USE_CTABLE)
1913             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1914               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1915             } else {
1916               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1917             }
1918 #else
1919             tcol = cmap[rbuf3_i[ct2]];
1920 #endif
1921             if (tcol) {
1922               subcols[idex]   = --tcol;
1923               subvals[idex++] = rbuf4_i[ct2];
1924 
1925               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1926                For reuse, we replace received C->j with index that should be inserted to submat */
1927               rbuf3_i[ct3++] = ct2;
1928             }
1929           }
1930           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1931 
1932         } else { /* scall == MAT_REUSE_MATRIX */
1933           submat = submats[0];
1934           subc   = (Mat_SeqAIJ*)submat->data;
1935 
1936           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1937           for (l=0; l<nnz; l++) {
1938             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1939             subvals[idex++] = rbuf4_i[ct2];
1940           }
1941 
1942           bj = subc->j + subc->i[row];
1943           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1944         }
1945       } else { /* allcolumns */
1946         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1947         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1948         ct2 += nnz;
1949       }
1950     }
1951   }
1952 
1953   /* sending a->a are done */
1954   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1955   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1956 
1957   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1958   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1959   submats[0] = submat;
1960 
1961   /* Restore the indices */
1962   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1963   if (!allcolumns) {
1964     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1965   }
1966 
1967   /* Destroy allocated memory */
1968   for (i=0; i<nrqs; ++i) {
1969     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1970   }
1971   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1972   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1973   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1974 
1975   if (scall == MAT_INITIAL_MATRIX) {
1976     ierr = PetscFree(lens);CHKERRQ(ierr);
1977     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1978     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1979   }
1980   PetscFunctionReturn(0);
1981 }
1982 
1983 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1984 {
1985   PetscErrorCode ierr;
1986   PetscInt       ncol;
1987   PetscBool      colflag,allcolumns=PETSC_FALSE;
1988 
1989   PetscFunctionBegin;
1990   /* Allocate memory to hold all the submatrices */
1991   if (scall != MAT_REUSE_MATRIX) {
1992     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
1993   }
1994 
1995   /* Check for special case: each processor gets entire matrix columns */
1996   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1997   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1998   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1999 
2000   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2001   PetscFunctionReturn(0);
2002 }
2003 
2004 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2005 {
2006   PetscErrorCode ierr;
2007   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2008   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2009 
2010   PetscFunctionBegin;
2011   /* Check for special case: each processor gets entire matrix */
2012   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2013     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2014     PetscFunctionReturn(0);
2015   }
2016 
2017   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2018     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2019     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2020     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2021     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2022     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2023       wantallmatrix = PETSC_TRUE;
2024 
2025       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2026     }
2027   }
2028   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2029   if (twantallmatrix) {
2030     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2031     PetscFunctionReturn(0);
2032   }
2033 
2034   /* Allocate memory to hold all the submatrices */
2035   if (scall != MAT_REUSE_MATRIX) {
2036     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2037   }
2038 
2039   /* Check for special case: each processor gets entire matrix columns */
2040   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2041   for (i=0; i<ismax; i++) {
2042     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2043     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2044     if (colflag && ncol == C->cmap->N) {
2045       allcolumns[i] = PETSC_TRUE;
2046     } else {
2047       allcolumns[i] = PETSC_FALSE;
2048     }
2049   }
2050 
2051   /* Determine the number of stages through which submatrices are done */
2052   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2053 
2054   /*
2055      Each stage will extract nmax submatrices.
2056      nmax is determined by the matrix column dimension.
2057      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2058   */
2059   if (!nmax) nmax = 1;
2060   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2061 
2062   /* Make sure every processor loops through the nstages */
2063   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2064 
2065   for (i=0,pos=0; i<nstages; i++) {
2066     if (pos+nmax <= ismax) max_no = nmax;
2067     else if (pos == ismax) max_no = 0;
2068     else                   max_no = ismax-pos;
2069     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2070     pos += max_no;
2071   }
2072 
2073   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2074   PetscFunctionReturn(0);
2075 }
2076 
2077 /* -------------------------------------------------------------------------*/
2078 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2079 {
2080   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2081   Mat            A  = c->A;
2082   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2083   const PetscInt **icol,**irow;
2084   PetscInt       *nrow,*ncol,start;
2085   PetscErrorCode ierr;
2086   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2087   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2088   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2089   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2090   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2091 #if defined(PETSC_USE_CTABLE)
2092   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2093 #else
2094   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2095 #endif
2096   const PetscInt *irow_i;
2097   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2098   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2099   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2100   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2101   MPI_Status     *r_status3,*r_status4,*s_status4;
2102   MPI_Comm       comm;
2103   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2104   PetscMPIInt    *onodes1,*olengths1;
2105   PetscMPIInt    idex,idex2,end;
2106 
2107   PetscFunctionBegin;
2108   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2109   tag0 = ((PetscObject)C)->tag;
2110   size = c->size;
2111   rank = c->rank;
2112 
2113   /* Get some new tags to keep the communication clean */
2114   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2115   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2116   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2117 
2118   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2119 
2120   for (i=0; i<ismax; i++) {
2121     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2122     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2123     if (allcolumns[i]) {
2124       icol[i] = NULL;
2125       ncol[i] = C->cmap->N;
2126     } else {
2127       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2128       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2129     }
2130   }
2131 
2132   /* evaluate communication - mesg to who, length of mesg, and buffer space
2133      required. Based on this, buffers are allocated, and data copied into them*/
2134   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2135   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2136   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2137   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2138   for (i=0; i<ismax; i++) {
2139     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2140     jmax   = nrow[i];
2141     irow_i = irow[i];
2142     for (j=0; j<jmax; j++) {
2143       l   = 0;
2144       row = irow_i[j];
2145       while (row >= C->rmap->range[l+1]) l++;
2146       proc = l;
2147       w4[proc]++;
2148     }
2149     for (j=0; j<size; j++) {
2150       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2151     }
2152   }
2153 
2154   nrqs     = 0;              /* no of outgoing messages */
2155   msz      = 0;              /* total mesg length (for all procs) */
2156   w1[rank] = 0;              /* no mesg sent to self */
2157   w3[rank] = 0;
2158   for (i=0; i<size; i++) {
2159     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2160   }
2161   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2162   for (i=0,j=0; i<size; i++) {
2163     if (w1[i]) { pa[j] = i; j++; }
2164   }
2165 
2166   /* Each message would have a header = 1 + 2*(no of IS) + data */
2167   for (i=0; i<nrqs; i++) {
2168     j      = pa[i];
2169     w1[j] += w2[j] + 2* w3[j];
2170     msz   += w1[j];
2171   }
2172   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2173 
2174   /* Determine the number of messages to expect, their lengths, from from-ids */
2175   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2176   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2177 
2178   /* Now post the Irecvs corresponding to these messages */
2179   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2180 
2181   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2182   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2183 
2184   /* Allocate Memory for outgoing messages */
2185   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2186   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2187   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2188 
2189   {
2190     PetscInt *iptr = tmp;
2191     k    = 0;
2192     for (i=0; i<nrqs; i++) {
2193       j        = pa[i];
2194       iptr    += k;
2195       sbuf1[j] = iptr;
2196       k        = w1[j];
2197     }
2198   }
2199 
2200   /* Form the outgoing messages */
2201   /* Initialize the header space */
2202   for (i=0; i<nrqs; i++) {
2203     j           = pa[i];
2204     sbuf1[j][0] = 0;
2205     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2206     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2207   }
2208 
2209   /* Parse the isrow and copy data into outbuf */
2210   for (i=0; i<ismax; i++) {
2211     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2212     irow_i = irow[i];
2213     jmax   = nrow[i];
2214     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2215       l   = 0;
2216       row = irow_i[j];
2217       while (row >= C->rmap->range[l+1]) l++;
2218       proc = l;
2219       if (proc != rank) { /* copy to the outgoing buf*/
2220         ctr[proc]++;
2221         *ptr[proc] = row;
2222         ptr[proc]++;
2223       }
2224     }
2225     /* Update the headers for the current IS */
2226     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2227       if ((ctr_j = ctr[j])) {
2228         sbuf1_j        = sbuf1[j];
2229         k              = ++sbuf1_j[0];
2230         sbuf1_j[2*k]   = ctr_j;
2231         sbuf1_j[2*k-1] = i;
2232       }
2233     }
2234   }
2235 
2236   /*  Now  post the sends */
2237   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2238   for (i=0; i<nrqs; ++i) {
2239     j    = pa[i];
2240     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2241   }
2242 
2243   /* Post Receives to capture the buffer size */
2244   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2245   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2246   rbuf2[0] = tmp + msz;
2247   for (i=1; i<nrqs; ++i) {
2248     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2249   }
2250   for (i=0; i<nrqs; ++i) {
2251     j    = pa[i];
2252     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2253   }
2254 
2255   /* Send to other procs the buf size they should allocate */
2256 
2257 
2258   /* Receive messages*/
2259   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2260   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2261   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2262   {
2263     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2264     PetscInt   *sbuf2_i;
2265 
2266     for (i=0; i<nrqr; ++i) {
2267       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2268 
2269       req_size[idex] = 0;
2270       rbuf1_i        = rbuf1[idex];
2271       start          = 2*rbuf1_i[0] + 1;
2272       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2273       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2274       sbuf2_i        = sbuf2[idex];
2275       for (j=start; j<end; j++) {
2276         id              = rbuf1_i[j] - rstart;
2277         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2278         sbuf2_i[j]      = ncols;
2279         req_size[idex] += ncols;
2280       }
2281       req_source[idex] = r_status1[i].MPI_SOURCE;
2282       /* form the header */
2283       sbuf2_i[0] = req_size[idex];
2284       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2285 
2286       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2287     }
2288   }
2289   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2290   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2291 
2292   /*  recv buffer sizes */
2293   /* Receive messages*/
2294 
2295   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2296   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2297   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2298   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2299   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2300 
2301   for (i=0; i<nrqs; ++i) {
2302     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2303     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2304     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2305     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2306     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2307   }
2308   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2309   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2310 
2311   /* Wait on sends1 and sends2 */
2312   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2313   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2314 
2315   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2316   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2317   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2318   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2319   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2320   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2321 
2322   /* Now allocate buffers for a->j, and send them off */
2323   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2324   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2325   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2326   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2327 
2328   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2329   {
2330     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2331     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2332     PetscInt cend = C->cmap->rend;
2333     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2334 
2335     for (i=0; i<nrqr; i++) {
2336       rbuf1_i   = rbuf1[i];
2337       sbuf_aj_i = sbuf_aj[i];
2338       ct1       = 2*rbuf1_i[0] + 1;
2339       ct2       = 0;
2340       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2341         kmax = rbuf1[i][2*j];
2342         for (k=0; k<kmax; k++,ct1++) {
2343           row    = rbuf1_i[ct1] - rstart;
2344           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2345           ncols  = nzA + nzB;
2346           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2347 
2348           /* load the column indices for this row into cols*/
2349           cols = sbuf_aj_i + ct2;
2350 
2351           lwrite = 0;
2352           for (l=0; l<nzB; l++) {
2353             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2354           }
2355           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2356           for (l=0; l<nzB; l++) {
2357             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2358           }
2359 
2360           ct2 += ncols;
2361         }
2362       }
2363       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2364     }
2365   }
2366   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2367   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2368 
2369   /* Allocate buffers for a->a, and send them off */
2370   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2371   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2372   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2373   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2374 
2375   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2376   {
2377     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2378     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2379     PetscInt    cend   = C->cmap->rend;
2380     PetscInt    *b_j   = b->j;
2381     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2382 
2383     for (i=0; i<nrqr; i++) {
2384       rbuf1_i   = rbuf1[i];
2385       sbuf_aa_i = sbuf_aa[i];
2386       ct1       = 2*rbuf1_i[0]+1;
2387       ct2       = 0;
2388       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2389         kmax = rbuf1_i[2*j];
2390         for (k=0; k<kmax; k++,ct1++) {
2391           row    = rbuf1_i[ct1] - rstart;
2392           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2393           ncols  = nzA + nzB;
2394           cworkB = b_j + b_i[row];
2395           vworkA = a_a + a_i[row];
2396           vworkB = b_a + b_i[row];
2397 
2398           /* load the column values for this row into vals*/
2399           vals = sbuf_aa_i+ct2;
2400 
2401           lwrite = 0;
2402           for (l=0; l<nzB; l++) {
2403             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2404           }
2405           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2406           for (l=0; l<nzB; l++) {
2407             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2408           }
2409 
2410           ct2 += ncols;
2411         }
2412       }
2413       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2414     }
2415   }
2416   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2417   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2418   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2419   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2420 
2421   /* Form the matrix */
2422   /* create col map: global col of C -> local col of submatrices */
2423   {
2424     const PetscInt *icol_i;
2425 #if defined(PETSC_USE_CTABLE)
2426     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2427     for (i=0; i<ismax; i++) {
2428       if (!allcolumns[i]) {
2429         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2430 
2431         jmax   = ncol[i];
2432         icol_i = icol[i];
2433         cmap_i = cmap[i];
2434         for (j=0; j<jmax; j++) {
2435           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2436         }
2437       } else {
2438         cmap[i] = NULL;
2439       }
2440     }
2441 #else
2442     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2443     for (i=0; i<ismax; i++) {
2444       if (!allcolumns[i]) {
2445         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2446         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2447         jmax   = ncol[i];
2448         icol_i = icol[i];
2449         cmap_i = cmap[i];
2450         for (j=0; j<jmax; j++) {
2451           cmap_i[icol_i[j]] = j+1;
2452         }
2453       } else {
2454         cmap[i] = NULL;
2455       }
2456     }
2457 #endif
2458   }
2459 
2460   /* Create lens which is required for MatCreate... */
2461   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2462   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2463   if (ismax) {
2464     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2465     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2466   }
2467   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2468 
2469   /* Update lens from local data */
2470   for (i=0; i<ismax; i++) {
2471     jmax = nrow[i];
2472     if (!allcolumns[i]) cmap_i = cmap[i];
2473     irow_i = irow[i];
2474     lens_i = lens[i];
2475     for (j=0; j<jmax; j++) {
2476       l   = 0;
2477       row = irow_i[j];
2478       while (row >= C->rmap->range[l+1]) l++;
2479       proc = l;
2480       if (proc == rank) {
2481         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2482         if (!allcolumns[i]) {
2483           for (k=0; k<ncols; k++) {
2484 #if defined(PETSC_USE_CTABLE)
2485             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2486 #else
2487             tcol = cmap_i[cols[k]];
2488 #endif
2489             if (tcol) lens_i[j]++;
2490           }
2491         } else { /* allcolumns */
2492           lens_i[j] = ncols;
2493         }
2494         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2495       }
2496     }
2497   }
2498 
2499   /* Create row map: global row of C -> local row of submatrices */
2500 #if defined(PETSC_USE_CTABLE)
2501   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2502   for (i=0; i<ismax; i++) {
2503     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2504     irow_i = irow[i];
2505     jmax   = nrow[i];
2506     for (j=0; j<jmax; j++) {
2507       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2508     }
2509   }
2510 #else
2511   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2512   if (ismax) {
2513     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2514     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2515   }
2516   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2517   for (i=0; i<ismax; i++) {
2518     rmap_i = rmap[i];
2519     irow_i = irow[i];
2520     jmax   = nrow[i];
2521     for (j=0; j<jmax; j++) {
2522       rmap_i[irow_i[j]] = j;
2523     }
2524   }
2525 #endif
2526 
2527   /* Update lens from offproc data */
2528   {
2529     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2530 
2531     for (tmp2=0; tmp2<nrqs; tmp2++) {
2532       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2533       idex    = pa[idex2];
2534       sbuf1_i = sbuf1[idex];
2535       jmax    = sbuf1_i[0];
2536       ct1     = 2*jmax+1;
2537       ct2     = 0;
2538       rbuf2_i = rbuf2[idex2];
2539       rbuf3_i = rbuf3[idex2];
2540       for (j=1; j<=jmax; j++) {
2541         is_no  = sbuf1_i[2*j-1];
2542         max1   = sbuf1_i[2*j];
2543         lens_i = lens[is_no];
2544         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2545         rmap_i = rmap[is_no];
2546         for (k=0; k<max1; k++,ct1++) {
2547 #if defined(PETSC_USE_CTABLE)
2548           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2549           row--;
2550           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2551 #else
2552           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2553 #endif
2554           max2 = rbuf2_i[ct1];
2555           for (l=0; l<max2; l++,ct2++) {
2556             if (!allcolumns[is_no]) {
2557 #if defined(PETSC_USE_CTABLE)
2558               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2559 #else
2560               tcol = cmap_i[rbuf3_i[ct2]];
2561 #endif
2562               if (tcol) lens_i[row]++;
2563             } else { /* allcolumns */
2564               lens_i[row]++; /* lens_i[row] += max2 ? */
2565             }
2566           }
2567         }
2568       }
2569     }
2570   }
2571   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2572   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2573   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2574   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2575   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2576 
2577   /* Create the submatrices */
2578   if (scall == MAT_REUSE_MATRIX) {
2579     PetscBool flag;
2580 
2581     /*
2582         Assumes new rows are same length as the old rows,hence bug!
2583     */
2584     for (i=0; i<ismax; i++) {
2585       mat = (Mat_SeqAIJ*)(submats[i]->data);
2586       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2587       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2588       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2589       /* Initial matrix as if empty */
2590       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2591 
2592       submats[i]->factortype = C->factortype;
2593     }
2594   } else {
2595     for (i=0; i<ismax; i++) {
2596       PetscInt rbs,cbs;
2597       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2598       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2599 
2600       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2601       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2602 
2603       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2604       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2605       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2606     }
2607   }
2608 
2609   /* Assemble the matrices */
2610   /* First assemble the local rows */
2611   {
2612     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2613     PetscScalar *imat_a;
2614 
2615     for (i=0; i<ismax; i++) {
2616       mat       = (Mat_SeqAIJ*)submats[i]->data;
2617       imat_ilen = mat->ilen;
2618       imat_j    = mat->j;
2619       imat_i    = mat->i;
2620       imat_a    = mat->a;
2621 
2622       if (!allcolumns[i]) cmap_i = cmap[i];
2623       rmap_i = rmap[i];
2624       irow_i = irow[i];
2625       jmax   = nrow[i];
2626       for (j=0; j<jmax; j++) {
2627         l   = 0;
2628         row = irow_i[j];
2629         while (row >= C->rmap->range[l+1]) l++;
2630         proc = l;
2631         if (proc == rank) {
2632           old_row = row;
2633 #if defined(PETSC_USE_CTABLE)
2634           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2635           row--;
2636 #else
2637           row = rmap_i[row];
2638 #endif
2639           ilen_row = imat_ilen[row];
2640           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2641           mat_i    = imat_i[row];
2642           mat_a    = imat_a + mat_i;
2643           mat_j    = imat_j + mat_i;
2644           if (!allcolumns[i]) {
2645             for (k=0; k<ncols; k++) {
2646 #if defined(PETSC_USE_CTABLE)
2647               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2648 #else
2649               tcol = cmap_i[cols[k]];
2650 #endif
2651               if (tcol) {
2652                 *mat_j++ = tcol - 1;
2653                 *mat_a++ = vals[k];
2654                 ilen_row++;
2655               }
2656             }
2657           } else { /* allcolumns */
2658             for (k=0; k<ncols; k++) {
2659               *mat_j++ = cols[k];  /* global col index! */
2660               *mat_a++ = vals[k];
2661               ilen_row++;
2662             }
2663           }
2664           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2665 
2666           imat_ilen[row] = ilen_row;
2667         }
2668       }
2669     }
2670   }
2671 
2672   /*   Now assemble the off proc rows*/
2673   {
2674     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2675     PetscInt    *imat_j,*imat_i;
2676     PetscScalar *imat_a,*rbuf4_i;
2677 
2678     for (tmp2=0; tmp2<nrqs; tmp2++) {
2679       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2680       idex    = pa[idex2];
2681       sbuf1_i = sbuf1[idex];
2682       jmax    = sbuf1_i[0];
2683       ct1     = 2*jmax + 1;
2684       ct2     = 0;
2685       rbuf2_i = rbuf2[idex2];
2686       rbuf3_i = rbuf3[idex2];
2687       rbuf4_i = rbuf4[idex2];
2688       for (j=1; j<=jmax; j++) {
2689         is_no     = sbuf1_i[2*j-1];
2690         rmap_i    = rmap[is_no];
2691         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2692         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2693         imat_ilen = mat->ilen;
2694         imat_j    = mat->j;
2695         imat_i    = mat->i;
2696         imat_a    = mat->a;
2697         max1      = sbuf1_i[2*j];
2698         for (k=0; k<max1; k++,ct1++) {
2699           row = sbuf1_i[ct1];
2700 #if defined(PETSC_USE_CTABLE)
2701           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2702           row--;
2703 #else
2704           row = rmap_i[row];
2705 #endif
2706           ilen  = imat_ilen[row];
2707           mat_i = imat_i[row];
2708           mat_a = imat_a + mat_i;
2709           mat_j = imat_j + mat_i;
2710           max2  = rbuf2_i[ct1];
2711           if (!allcolumns[is_no]) {
2712             for (l=0; l<max2; l++,ct2++) {
2713 
2714 #if defined(PETSC_USE_CTABLE)
2715               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2716 #else
2717               tcol = cmap_i[rbuf3_i[ct2]];
2718 #endif
2719               if (tcol) {
2720                 *mat_j++ = tcol - 1;
2721                 *mat_a++ = rbuf4_i[ct2];
2722                 ilen++;
2723               }
2724             }
2725           } else { /* allcolumns */
2726             for (l=0; l<max2; l++,ct2++) {
2727               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2728               *mat_a++ = rbuf4_i[ct2];
2729               ilen++;
2730             }
2731           }
2732           imat_ilen[row] = ilen;
2733         }
2734       }
2735     }
2736   }
2737 
2738   /* sort the rows */
2739   {
2740     PetscInt    *imat_ilen,*imat_j,*imat_i;
2741     PetscScalar *imat_a;
2742 
2743     for (i=0; i<ismax; i++) {
2744       mat       = (Mat_SeqAIJ*)submats[i]->data;
2745       imat_j    = mat->j;
2746       imat_i    = mat->i;
2747       imat_a    = mat->a;
2748       imat_ilen = mat->ilen;
2749 
2750       if (allcolumns[i]) continue;
2751       jmax = nrow[i];
2752       for (j=0; j<jmax; j++) {
2753         PetscInt ilen;
2754 
2755         mat_i = imat_i[j];
2756         mat_a = imat_a + mat_i;
2757         mat_j = imat_j + mat_i;
2758         ilen  = imat_ilen[j];
2759         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2760       }
2761     }
2762   }
2763 
2764   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2765   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2766   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2767   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2768   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2769 
2770   /* Restore the indices */
2771   for (i=0; i<ismax; i++) {
2772     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2773     if (!allcolumns[i]) {
2774       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2775     }
2776   }
2777 
2778   /* Destroy allocated memory */
2779   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2780   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2781   ierr = PetscFree(pa);CHKERRQ(ierr);
2782 
2783   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2784   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2785   for (i=0; i<nrqr; ++i) {
2786     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2787   }
2788   for (i=0; i<nrqs; ++i) {
2789     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2790     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2791   }
2792 
2793   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2794   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2795   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2796   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2797   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2798   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2799   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2800 
2801 #if defined(PETSC_USE_CTABLE)
2802   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2803 #else
2804   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2805 #endif
2806   ierr = PetscFree(rmap);CHKERRQ(ierr);
2807 
2808   for (i=0; i<ismax; i++) {
2809     if (!allcolumns[i]) {
2810 #if defined(PETSC_USE_CTABLE)
2811       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2812 #else
2813       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2814 #endif
2815     }
2816   }
2817   ierr = PetscFree(cmap);CHKERRQ(ierr);
2818   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2819   ierr = PetscFree(lens);CHKERRQ(ierr);
2820 
2821   for (i=0; i<ismax; i++) {
2822     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2823     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2824   }
2825   PetscFunctionReturn(0);
2826 }
2827 
2828 /*
2829  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2830  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2832  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2833  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2834  state, and needs to be "assembled" later by compressing B's column space.
2835 
2836  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2837  Following this call, C->A & C->B have been created, even if empty.
2838  */
2839 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2840 {
2841   /* If making this function public, change the error returned in this function away from _PLIB. */
2842   PetscErrorCode ierr;
2843   Mat_MPIAIJ     *aij;
2844   Mat_SeqAIJ     *Baij;
2845   PetscBool      seqaij,Bdisassembled;
2846   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2847   PetscScalar    v;
2848   const PetscInt *rowindices,*colindices;
2849 
2850   PetscFunctionBegin;
2851   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2852   if (A) {
2853     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2854     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2855     if (rowemb) {
2856       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2857       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2858     } else {
2859       if (C->rmap->n != A->rmap->n) {
2860 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2861       }
2862     }
2863     if (dcolemb) {
2864       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2865       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2866     } else {
2867       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2868     }
2869   }
2870   if (B) {
2871     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2872     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2873     if (rowemb) {
2874       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2875       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2876     } else {
2877       if (C->rmap->n != B->rmap->n) {
2878 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2879       }
2880     }
2881     if (ocolemb) {
2882       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2883       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2884     } else {
2885       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2886     }
2887   }
2888 
2889   aij    = (Mat_MPIAIJ*)(C->data);
2890   if (!aij->A) {
2891     /* Mimic parts of MatMPIAIJSetPreallocation() */
2892     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2893     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2894     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2895     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2896     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2897   }
2898   if (A) {
2899     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2900   } else {
2901     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2902   }
2903   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2904     /*
2905       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2906       need to "disassemble" B -- convert it to using C's global indices.
2907       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2908 
2909       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2910       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2911 
2912       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2913       At least avoid calling MatSetValues() and the implied searches?
2914     */
2915 
2916     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2917 #if defined(PETSC_USE_CTABLE)
2918       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2919 #else
2920       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2921       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2922       if (aij->B) {
2923         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2924       }
2925 #endif
2926       ngcol = 0;
2927       if (aij->lvec) {
2928 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2929       }
2930       if (aij->garray) {
2931 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2932 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2933       }
2934       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2935       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2936     }
2937     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2938       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2939     }
2940     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2941       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2942     }
2943   }
2944   Bdisassembled = PETSC_FALSE;
2945   if (!aij->B) {
2946     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2947     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2948     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2949     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2950     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2951     Bdisassembled = PETSC_TRUE;
2952   }
2953   if (B) {
2954     Baij = (Mat_SeqAIJ*)(B->data);
2955     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2956       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2957       for (i=0; i<B->rmap->n; i++) {
2958 	nz[i] = Baij->i[i+1] - Baij->i[i];
2959       }
2960       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2961       ierr = PetscFree(nz);CHKERRQ(ierr);
2962     }
2963 
2964     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2965     shift = rend-rstart;
2966     count = 0;
2967     rowindices = NULL;
2968     colindices = NULL;
2969     if (rowemb) {
2970       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
2971     }
2972     if (ocolemb) {
2973       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
2974     }
2975     for (i=0; i<B->rmap->n; i++) {
2976       PetscInt row;
2977       row = i;
2978       if (rowindices) row = rowindices[i];
2979       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2980 	col  = Baij->j[count];
2981 	if (colindices) col = colindices[col];
2982 	if (Bdisassembled && col>=rstart) col += shift;
2983 	v    = Baij->a[count];
2984 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
2985 	++count;
2986       }
2987     }
2988     /* No assembly for aij->B is necessary. */
2989     /* FIXME: set aij->B's nonzerostate correctly. */
2990   } else {
2991     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
2992   }
2993   C->preallocated  = PETSC_TRUE;
2994   C->was_assembled = PETSC_FALSE;
2995   C->assembled     = PETSC_FALSE;
2996    /*
2997       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2998       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2999    */
3000   PetscFunctionReturn(0);
3001 }
3002 
3003 /*
3004   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3005  */
3006 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3007 {
3008   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3009 
3010   PetscFunctionBegin;
3011   PetscValidPointer(A,2);
3012   PetscValidPointer(B,3);
3013   /* FIXME: make sure C is assembled */
3014   *A = aij->A;
3015   *B = aij->B;
3016   /* Note that we don't incref *A and *B, so be careful! */
3017   PetscFunctionReturn(0);
3018 }
3019 
3020 /*
3021   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3022   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3023 */
3024 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3025                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3026 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3027 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3028 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3029 {
3030   PetscErrorCode ierr;
3031   PetscMPIInt    isize,flag;
3032   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
3033   Mat            *A,*B;
3034   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3035 
3036   PetscFunctionBegin;
3037   if (!ismax) PetscFunctionReturn(0);
3038 
3039   for (i = 0, cismax = 0; i < ismax; ++i) {
3040     PetscMPIInt isize;
3041     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
3042     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3043     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
3044     if (isize > 1) ++cismax;
3045   }
3046   /*
3047      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3048      ispar counts the number of parallel ISs across C's comm.
3049   */
3050   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
3051   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3052     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3053     PetscFunctionReturn(0);
3054   }
3055 
3056   /* if (ispar) */
3057   /*
3058     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3059     These are used to extract the off-diag portion of the resulting parallel matrix.
3060     The row IS for the off-diag portion is the same as for the diag portion,
3061     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3062   */
3063   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3064   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3065   for (i = 0, ii = 0; i < ismax; ++i) {
3066     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3067     if (isize > 1) {
3068       /*
3069 	 TODO: This is the part that's ***NOT SCALABLE***.
3070 	 To fix this we need to extract just the indices of C's nonzero columns
3071 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3072 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
3073 	 to be done without serializing on the IS list, so, most likely, it is best
3074 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
3075       */
3076       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3077       /* Now we have to
3078 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3079 	     were sorted on each rank, concatenated they might no longer be sorted;
3080 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3081 	     indices in the nondecreasing order to the original index positions.
3082 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3083       */
3084       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3085       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3086       ++ii;
3087     }
3088   }
3089   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3090   for (i = 0, ii = 0; i < ismax; ++i) {
3091     PetscInt       j,issize;
3092     const PetscInt *indices;
3093 
3094     /*
3095        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3096      */
3097     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3098     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3099     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3100     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3101     for (j = 1; j < issize; ++j) {
3102       if (indices[j] == indices[j-1]) {
3103 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3104       }
3105     }
3106     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3107 
3108 
3109     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3110     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3111     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3112     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3113     for (j = 1; j < issize; ++j) {
3114       if (indices[j-1] == indices[j]) {
3115 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3116       }
3117     }
3118     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3119     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3120     if (isize > 1) {
3121       cisrow[ii] = isrow[i];
3122       ++ii;
3123     }
3124   }
3125   /*
3126     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3127     array of sequential matrices underlying the resulting parallel matrices.
3128     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3129     contain duplicates.
3130 
3131     There are as many diag matrices as there are original index sets. There are only as many parallel
3132     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3133 
3134     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3135     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3136       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3137       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3138       with A[i] and B[ii] extracted from the corresponding MPI submat.
3139     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3140       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3141       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3142       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3143       values into A[i] and B[ii] sitting inside the corresponding submat.
3144     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3145       A[i], B[ii], AA[i] or BB[ii] matrices.
3146   */
3147   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3148   if (scall == MAT_INITIAL_MATRIX) {
3149     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3150     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
3151   } else {
3152     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
3153     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
3154     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
3155     for (i = 0, ii = 0; i < ismax; ++i) {
3156       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3157       if (isize > 1) {
3158 	Mat AA,BB;
3159 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
3160 	if (!isrow_p[i] && !iscol_p[i]) {
3161 	  A[i] = AA;
3162 	} else {
3163 	  /* TODO: extract A[i] composed on AA. */
3164 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3165 	}
3166 	if (!isrow_p[i] && !ciscol_p[ii]) {
3167 	  B[ii] = BB;
3168 	} else {
3169 	  /* TODO: extract B[ii] composed on BB. */
3170 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
3171 	}
3172 	++ii;
3173       } else {
3174 	if (!isrow_p[i] && !iscol_p[i]) {
3175 	  A[i] = (*submat)[i];
3176 	} else {
3177 	  /* TODO: extract A[i] composed on (*submat)[i]. */
3178 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
3179 	}
3180       }
3181     }
3182   }
3183   /* Now obtain the sequential A and B submatrices separately. */
3184   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
3185   /* I did not figure out a good way to fix it right now.
3186    * Local column size of B[i] is different from the size of ciscol[i].
3187    * B[i]'s size is finally determined by MatAssembly, while
3188    * ciscol[i] is computed as the complement of iscol[i].
3189    * It is better to keep only nonzero indices when computing
3190    * the complement ciscol[i].
3191    * */
3192   if(scall==MAT_REUSE_MATRIX){
3193 	for(i=0; i<cismax; i++){
3194 	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
3195 	  B[i]->cmap->n = ciscol_localsize;
3196 	}
3197   }
3198   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
3199   /*
3200     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3201     matrices A & B have been extracted directly into the parallel matrices containing them, or
3202     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3203     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3204     to have the same sparsity pattern.
3205     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3206     must be constructed for C. This is done by setseqmat(s).
3207   */
3208   for (i = 0, ii = 0; i < ismax; ++i) {
3209     /*
3210        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3211        That way we can avoid sorting and computing permutations when reusing.
3212        To this end:
3213         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3214 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
3215 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3216     */
3217     MatStructure pattern;
3218     if (scall == MAT_INITIAL_MATRIX) {
3219       pattern = DIFFERENT_NONZERO_PATTERN;
3220     } else {
3221       pattern = SAME_NONZERO_PATTERN;
3222     }
3223     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
3224     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3225     if (isize > 1) {
3226       if (scall == MAT_INITIAL_MATRIX) {
3227 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3228 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3229 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3230 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3231 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3232       }
3233       /*
3234 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3235       */
3236       {
3237 	Mat AA,BB;
3238 	AA = NULL;
3239 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
3240 	BB = NULL;
3241 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
3242 	if (AA || BB) {
3243 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3244 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3245 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3246 	}
3247 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
3248 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
3249 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
3250 	}
3251 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
3252 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
3253 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
3254 	}
3255       }
3256       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3257       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3258       ++ii;
3259     } else { /* if (isize == 1) */
3260       if (scall == MAT_INITIAL_MATRIX) {
3261 	if (isrow_p[i] || iscol_p[i]) {
3262 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3263 	} else (*submat)[i] = A[i];
3264       }
3265       if (isrow_p[i] || iscol_p[i]) {
3266 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3267 	/* Otherwise A is extracted straight into (*submats)[i]. */
3268 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3269 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
3270       }
3271     }
3272     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3273     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3274   }
3275   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3276   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3277   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3278   ierr = PetscFree(A);CHKERRQ(ierr);
3279   ierr = PetscFree(B);CHKERRQ(ierr);
3280   PetscFunctionReturn(0);
3281 }
3282 
3283 
3284 
3285 /*
3286    MatGetSubMatricesMPI_MPIAIJ - Type-specific (MPIAIJ) entry point for extracting
3287    submatrices that may live on subcommunicators of C's communicator.

3288    This is a thin dispatcher: all of the work is done by the type-independent
3289    MatGetSubMatricesMPI_MPIXAIJ(), which is parameterized here with the four
3290    AIJ-specific callbacks it needs:
3291      MatGetSubMatrices_MPIAIJ - extracts the sequential (diag/off-diag) pieces,
3292      MatGetSeqMats_MPIAIJ    - retrieves the underlying Seq A and B parts of an MPI matrix,
3293      MatSetSeqMat_SeqAIJ     - embeds a permuted sequential matrix into a Seq submatrix,
3294      MatSetSeqMats_MPIAIJ    - embeds the Seq A and B pieces into a parallel submatrix.

3295    Input Parameters:
3296 +    C      - the MPIAIJ matrix to extract from
3297 +    ismax  - number of index-set pairs (and resulting submatrices)
3298 +    isrow  - row index sets, one per submatrix (may be parallel or sequential ISs)
3299 +    iscol  - column index sets, one per submatrix
3300 +    scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

3301    Output Parameter:
3302 +    submat - array of extracted submatrices (allocated here when scall == MAT_INITIAL_MATRIX)
3303 */
3304 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3305 {
3306   PetscErrorCode ierr;
3307 
3308   PetscFunctionBegin;
3309   /* Delegate to the XAIJ driver with the AIJ-specific extraction/embedding callbacks. */
3310   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3311   PetscFunctionReturn(0);
3312 }
3313
3293