xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2bf68e3e0f2a61f71e7c65bee250bfa1c8ce0cdb)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 
22 #undef __FUNCT__
23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
25 {
26   PetscErrorCode ierr;
27   PetscInt       i;
28 
29   PetscFunctionBegin;
30   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
31   for (i=0; i<ov; ++i) {
32     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
33   }
34   PetscFunctionReturn(0);
35 }
36 
37 #undef __FUNCT__
38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
40 {
41   PetscErrorCode ierr;
42   PetscInt       i;
43 
44   PetscFunctionBegin;
45   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
46   for (i=0; i<ov; ++i) {
47     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
48   }
49   PetscFunctionReturn(0);
50 }
51 
52 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - one round of overlap increase.

   Algorithm (as implemented below):
     1. Gather all IS entries, classify each entry by its owning rank
        (via the matrix row layout); entries owned elsewhere become
        "remote rows" tagged with their IS id.
     2. If no rank has remote rows (checked with an allreduce), only local
        work is needed: call ..._Local_Scalable and return.
     3. Otherwise build a two-sided exchange (PetscCommBuildTwoSided) and a
        PetscSF to push <is id, row> pairs to the owning ranks.
     4. The owners compute the overlap columns for the requested rows
        (..._Send_Scalable), a second two-sided exchange/SF brings that data
        back, the local overlap is computed concurrently with the transfer,
        and ..._Receive_Scalable merges the received data into is[].

   Collective on the matrix communicator; is[] is replaced in place.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode   ierr;
  MPI_Comm         comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout      rmap;
  PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF          sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i     = length[i];
    indices_i    = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i=0; i<size; i++){
   if (tosizes_temp[i]){
     tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two PetscInts (<is id, row>) per remote row */
     tosizes_temp[i] = nto; /* a map from processor to index */
     toranks[nto++]  = i; /* processor */
   }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* each leaf of the SF addresses slot fromsizes[2*i+1]+j in the sender's todata buffer */
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /* pack the outgoing pairs; toffsets[j] is advanced as it is consumed */
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: the reply sizes/offsets travel back to the requesters */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: the local overlap is computed
     while the remote overlap data is in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
197 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge overlap data received
   from other ranks into the index sets.

   recvdata is a packed stream of triples <is id, count, count columns...>
   (the format produced by ..._Send_Scalable). Each existing IS is copied
   into a scratch row of size (max_lsize+nrecvs) — large enough to hold the
   original indices plus, in the worst case, every received entry — the
   received columns are appended to the matching row, duplicates are removed
   by a sort, and each IS is recreated from the merged, sorted indices.

   The incoming is[] entries are destroyed and replaced.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  /* isz[i] tracks the current fill of IS i's scratch row */
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one scratch row of (max_lsize+nrecvs) entries per IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id      = recvdata[i++];
    data_size  = recvdata[i++];
    indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i      = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i  = indices_temp+(max_lsize+nrecvs)*i;
    isz_i      = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
245 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - compute, on the owning rank, the
   overlap (set of nonzero columns) of rows requested by other ranks.

   Input: fromrows is the packed stream of <is id, row> pairs received from
   nfrom ranks (fromsizes[2*i] entries from rank fromranks[i]).  The routine
   walks the CSR structure of the local diagonal (Amat) and off-diagonal
   (Bmat) blocks of the MPIAIJ matrix to collect the global column indices of
   each requested row.

   Output: *sbrows is a packed reply buffer of triples
   <is id, count, count columns...> grouped per destination rank, and
   *sbrowsizes stores per destination rank the pair <size, offset> in the
   layout expected by PetscCommBuildTwoSided/PetscSF in the caller.
   Ownership of both arrays transfers to the caller (freed with PetscFree2).

   Two identical grouping passes are made over fromrows: the first only sizes
   the reply buffer, the second fills it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count, per (source rank, is id), the number of distinct columns */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row    = rows_i[l]-rstart; /* global row -> local row in the diagonal block */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart; /* local column -> global column */
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
          /* NOTE(review): all diagonal-block columns are collected here; the
             preceding comment looks stale — confirm against upstream history */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]]; /* off-diagonal columns are already global via gcols */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* pass 2: regroup the same data and fill the reply buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into the <size, offset> pairs the caller's SF expects */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
369 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - compute the purely local part of
   one overlap round: for every locally owned row in each IS, add all global
   column indices of that row's nonzeros (from both the diagonal Amat and
   off-diagonal Bmat blocks of the MPIAIJ matrix) to the IS.

   Rows owned by other ranks are skipped here; in the communicated path of
   ..._Once_Scalable their contribution arrives via ..._Send_Scalable /
   ..._Receive_Scalable instead.

   Each is[i] is destroyed and recreated from the sorted, deduplicated union
   of the collected columns. Scratch space is sized by the local nonzero
   count tnz = ai[an]+bi[bn], which bounds the columns collected per IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp,owner;
  PetscMPIInt       rank;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue; /* handled by the communication path */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart; /* local column -> global column */
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]]; /* gcols maps off-diagonal columns to global ids */
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
435 
436 
437 /*
438   Sample message format:
439   If a processor A wants processor B to process some elements corresponding
440   to index sets is[1],is[5]
441   mesg [0] = 2   (no of index sets in the mesg)
442   -----------
443   mesg [1] = 1 => is[1]
444   mesg [2] = sizeof(is[1]);
445   -----------
446   mesg [3] = 5  => is[5]
447   mesg [4] = sizeof(is[5]);
448   -----------
449   mesg [5]
450   mesg [n]  datas[1]
451   -----------
452   mesg[n+1]
453   mesg[m]  data(is[5])
454   -----------
455 
  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
459 */
460 #undef __FUNCT__
461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
463 {
464   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
465   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
466   const PetscInt **idx,*idx_i;
467   PetscInt       *n,**data,len;
468 #if defined(PETSC_USE_CTABLE)
469   PetscTable     *table_data,table_data_i;
470   PetscInt       *tdata,tcount,tcount_max;
471 #else
472   PetscInt       *data_i,*d_p;
473 #endif
474   PetscErrorCode ierr;
475   PetscMPIInt    size,rank,tag1,tag2;
476   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
477   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
478   PetscBT        *table;
479   MPI_Comm       comm;
480   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
481   MPI_Status     *s_status,*recv_status;
482   char           *t_p;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
486   size = c->size;
487   rank = c->rank;
488   M    = C->rmap->N;
489 
490   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
491   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
492 
493   ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);
494 
495   for (i=0; i<imax; i++) {
496     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
497     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
498   }
499 
500   /* evaluate communication - mesg to who,length of mesg, and buffer space
501      required. Based on this, buffers are allocated, and data copied into them  */
502   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
503   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
504   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
505   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
506   for (i=0; i<imax; i++) {
507     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
508     idx_i = idx[i];
509     len   = n[i];
510     for (j=0; j<len; j++) {
511       row = idx_i[j];
512       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
513       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
514       w4[proc]++;
515     }
516     for (j=0; j<size; j++) {
517       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
518     }
519   }
520 
521   nrqs     = 0;              /* no of outgoing messages */
522   msz      = 0;              /* total mesg length (for all proc */
523   w1[rank] = 0;              /* no mesg sent to intself */
524   w3[rank] = 0;
525   for (i=0; i<size; i++) {
526     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
527   }
528   /* pa - is list of processors to communicate with */
529   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
530   for (i=0,j=0; i<size; i++) {
531     if (w1[i]) {pa[j] = i; j++;}
532   }
533 
534   /* Each message would have a header = 1 + 2*(no of IS) + data */
535   for (i=0; i<nrqs; i++) {
536     j      = pa[i];
537     w1[j] += w2[j] + 2*w3[j];
538     msz   += w1[j];
539   }
540 
541   /* Determine the number of messages to expect, their lengths, from from-ids */
542   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
543   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
544 
545   /* Now post the Irecvs corresponding to these messages */
546   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
547 
548   /* Allocate Memory for outgoing messages */
549   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
550   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
551   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
552 
553   {
554     PetscInt *iptr = tmp,ict  = 0;
555     for (i=0; i<nrqs; i++) {
556       j         = pa[i];
557       iptr     +=  ict;
558       outdat[j] = iptr;
559       ict       = w1[j];
560     }
561   }
562 
563   /* Form the outgoing messages */
564   /* plug in the headers */
565   for (i=0; i<nrqs; i++) {
566     j            = pa[i];
567     outdat[j][0] = 0;
568     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
569     ptr[j]       = outdat[j] + 2*w3[j] + 1;
570   }
571 
572   /* Memory for doing local proc's work */
573   {
574     PetscInt M_BPB_imax = 0;
575 #if defined(PETSC_USE_CTABLE)
576     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
577     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
578     for (i=0; i<imax; i++) {
579       ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
580     }
581     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
582     for (i=0; i<imax; i++) {
583       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
584     }
585 #else
586     PetscInt Mimax = 0;
587     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
588     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
589     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
590     for (i=0; i<imax; i++) {
591       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
592       data[i]  = d_p + M*i;
593     }
594 #endif
595   }
596 
597   /* Parse the IS and update local tables and the outgoing buf with the data */
598   {
599     PetscInt n_i,isz_i,*outdat_j,ctr_j;
600     PetscBT  table_i;
601 
602     for (i=0; i<imax; i++) {
603       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
604       n_i     = n[i];
605       table_i = table[i];
606       idx_i   = idx[i];
607 #if defined(PETSC_USE_CTABLE)
608       table_data_i = table_data[i];
609 #else
610       data_i  = data[i];
611 #endif
612       isz_i   = isz[i];
613       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
614         row  = idx_i[j];
615         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
616         if (proc != rank) { /* copy to the outgoing buffer */
617           ctr[proc]++;
618           *ptr[proc] = row;
619           ptr[proc]++;
620         } else if (!PetscBTLookupSet(table_i,row)) {
621 #if defined(PETSC_USE_CTABLE)
622           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
623 #else
624           data_i[isz_i] = row; /* Update the local table */
625 #endif
626           isz_i++;
627         }
628       }
629       /* Update the headers for the current IS */
630       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
631         if ((ctr_j = ctr[j])) {
632           outdat_j        = outdat[j];
633           k               = ++outdat_j[0];
634           outdat_j[2*k]   = ctr_j;
635           outdat_j[2*k-1] = i;
636         }
637       }
638       isz[i] = isz_i;
639     }
640   }
641 
642   /*  Now  post the sends */
643   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
644   for (i=0; i<nrqs; ++i) {
645     j    = pa[i];
646     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
647   }
648 
649   /* No longer need the original indices */
650   for (i=0; i<imax; ++i) {
651     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
652   }
653   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
654 
655   for (i=0; i<imax; ++i) {
656     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
657   }
658 
659   /* Do Local work */
660 #if defined(PETSC_USE_CTABLE)
661   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
662 #else
663   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
664 #endif
665 
666   /* Receive messages */
667   ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
668   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
669 
670   ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
671   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
672 
673   /* Phase 1 sends are complete - deallocate buffers */
674   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
675   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
676 
677   ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
678   ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
679   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
680   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
681   ierr = PetscFree(rbuf);CHKERRQ(ierr);
682 
683 
684   /* Send the data back */
685   /* Do a global reduction to know the buffer space req for incoming messages */
686   {
687     PetscMPIInt *rw1;
688 
689     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
690 
691     for (i=0; i<nrqr; ++i) {
692       proc = recv_status[i].MPI_SOURCE;
693 
694       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
695       rw1[proc] = isz1[i];
696     }
697     ierr = PetscFree(onodes1);CHKERRQ(ierr);
698     ierr = PetscFree(olengths1);CHKERRQ(ierr);
699 
700     /* Determine the number of messages to expect, their lengths, from from-ids */
701     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
702     ierr = PetscFree(rw1);CHKERRQ(ierr);
703   }
704   /* Now post the Irecvs corresponding to these messages */
705   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
706 
707   /* Now  post the sends */
708   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
709   for (i=0; i<nrqr; ++i) {
710     j    = recv_status[i].MPI_SOURCE;
711     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
712   }
713 
714   /* receive work done on other processors */
715   {
716     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
717     PetscMPIInt idex;
718     PetscBT     table_i;
719     MPI_Status  *status2;
720 
721     ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
722     for (i=0; i<nrqs; ++i) {
723       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
724       /* Process the message */
725       rbuf2_i = rbuf2[idex];
726       ct1     = 2*rbuf2_i[0]+1;
727       jmax    = rbuf2[idex][0];
728       for (j=1; j<=jmax; j++) {
729         max     = rbuf2_i[2*j];
730         is_no   = rbuf2_i[2*j-1];
731         isz_i   = isz[is_no];
732         table_i = table[is_no];
733 #if defined(PETSC_USE_CTABLE)
734         table_data_i = table_data[is_no];
735 #else
736         data_i  = data[is_no];
737 #endif
738         for (k=0; k<max; k++,ct1++) {
739           row = rbuf2_i[ct1];
740           if (!PetscBTLookupSet(table_i,row)) {
741 #if defined(PETSC_USE_CTABLE)
742             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
743 #else
744             data_i[isz_i] = row;
745 #endif
746             isz_i++;
747           }
748         }
749         isz[is_no] = isz_i;
750       }
751     }
752 
753     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
754     ierr = PetscFree(status2);CHKERRQ(ierr);
755   }
756 
757 #if defined(PETSC_USE_CTABLE)
758   tcount_max = 0;
759   for (i=0; i<imax; ++i) {
760     table_data_i = table_data[i];
761     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
762     if (tcount_max < tcount) tcount_max = tcount;
763   }
764   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
765 #endif
766 
767   for (i=0; i<imax; ++i) {
768 #if defined(PETSC_USE_CTABLE)
769     PetscTablePosition tpos;
770     table_data_i = table_data[i];
771 
772     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
773     while (tpos) {
774       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
775       tdata[--j] = --k;
776     }
777     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
778 #else
779     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
780 #endif
781   }
782 
783   ierr = PetscFree(onodes2);CHKERRQ(ierr);
784   ierr = PetscFree(olengths2);CHKERRQ(ierr);
785 
786   ierr = PetscFree(pa);CHKERRQ(ierr);
787   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
788   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
789   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
790   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
791   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
792   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
793   ierr = PetscFree(s_status);CHKERRQ(ierr);
794   ierr = PetscFree(recv_status);CHKERRQ(ierr);
795   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
796   ierr = PetscFree(xdata);CHKERRQ(ierr);
797   ierr = PetscFree(isz1);CHKERRQ(ierr);
798 #if defined(PETSC_USE_CTABLE)
799   for (i=0; i<imax; i++) {
800     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
801   }
802   ierr = PetscFree(table_data);CHKERRQ(ierr);
803   ierr = PetscFree(tdata);CHKERRQ(ierr);
804   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
805 #else
806   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
807 #endif
808   PetscFunctionReturn(0);
809 }
810 
811 #undef __FUNCT__
812 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
813 /*
814    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
815        the work on the local processor.
816 
817      Inputs:
818       C      - MAT_MPIAIJ;
819       imax - total no of index sets processed at a time;
820       table  - an array of char - size = m bits.
821 
822      Output:
823       isz    - array containing the count of the solution elements corresponding
824                to each index set;
825       data or table_data  - pointer to the solutions
826 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;  /* local diagonal (A) and off-diagonal (B) blocks of C */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;  /* first global row owned by this process */
  cstart = C->cmap->rstart;  /* first global column of the diagonal block */
  ai     = a->i;             /* CSR row pointers / column indices of the A block */
  aj     = a->j;
  bi     = b->i;             /* CSR row pointers / column indices of the B block */
  bj     = b->j;
  garray = c->garray;        /* maps compressed B column indices to global columns */

  for (i=0; i<imax; i++) {   /* one pass per index set */
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table maps (value+1) -> (position+1) */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;    /* undo the +1 shifts applied when entries were inserted */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];        /* walk only the rows already in the set; new entries are appended past max */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;   /* global row -> local row index */
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* local column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {  /* append val only if not already in the set */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* compressed column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;          /* record the (possibly grown) size of this index set */

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
915 
916 #undef __FUNCT__
917 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
918 /*
919       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
920          and return the output
921 
922          Input:
923            C    - the matrix
924            nrqr - no of messages being processed.
925            rbuf - an array of pointers to the recieved requests
926 
927          Output:
928            xdata - array of messages to be sent back
929            isz1  - size of each message
930 
931   For better efficiency perhaps we should malloc separately each xdata[i],
932 then if a remalloc is required we need only copy the data for that one row
933 rather then all previous rows as it is now where a single large chunck of
934 memory is used.
935 
936 */
937 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
938 {
939   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
940   Mat            A  = c->A,B = c->B;
941   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
942   PetscErrorCode ierr;
943   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
944   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
945   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
946   PetscInt       *rbuf_i,kmax,rbuf_0;
947   PetscBT        xtable;
948 
949   PetscFunctionBegin;
950   m      = C->rmap->N;
951   rstart = C->rmap->rstart;
952   cstart = C->cmap->rstart;
953   ai     = a->i;
954   aj     = a->j;
955   bi     = b->i;
956   bj     = b->j;
957   garray = c->garray;
958 
959 
960   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
961     rbuf_i =  rbuf[i];
962     rbuf_0 =  rbuf_i[0];
963     ct    += rbuf_0;
964     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
965   }
966 
967   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
968   else max1 = 1;
969   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
970   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
971   ++no_malloc;
972   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
973   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
974 
975   ct3 = 0;
976   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
977     rbuf_i =  rbuf[i];
978     rbuf_0 =  rbuf_i[0];
979     ct1    =  2*rbuf_0+1;
980     ct2    =  ct1;
981     ct3   += ct1;
982     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
983       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
984       oct2 = ct2;
985       kmax = rbuf_i[2*j];
986       for (k=0; k<kmax; k++,ct1++) {
987         row = rbuf_i[ct1];
988         if (!PetscBTLookupSet(xtable,row)) {
989           if (!(ct3 < mem_estimate)) {
990             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
991             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
992             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
993             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
994             xdata[0]     = tmp;
995             mem_estimate = new_estimate; ++no_malloc;
996             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
997           }
998           xdata[i][ct2++] = row;
999           ct3++;
1000         }
1001       }
1002       for (k=oct2,max2=ct2; k<max2; k++) {
1003         row   = xdata[i][k] - rstart;
1004         start = ai[row];
1005         end   = ai[row+1];
1006         for (l=start; l<end; l++) {
1007           val = aj[l] + cstart;
1008           if (!PetscBTLookupSet(xtable,val)) {
1009             if (!(ct3 < mem_estimate)) {
1010               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1012               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1013               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1014               xdata[0]     = tmp;
1015               mem_estimate = new_estimate; ++no_malloc;
1016               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017             }
1018             xdata[i][ct2++] = val;
1019             ct3++;
1020           }
1021         }
1022         start = bi[row];
1023         end   = bi[row+1];
1024         for (l=start; l<end; l++) {
1025           val = garray[bj[l]];
1026           if (!PetscBTLookupSet(xtable,val)) {
1027             if (!(ct3 < mem_estimate)) {
1028               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1029               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1030               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
1031               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1032               xdata[0]     = tmp;
1033               mem_estimate = new_estimate; ++no_malloc;
1034               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1035             }
1036             xdata[i][ct2++] = val;
1037             ct3++;
1038           }
1039         }
1040       }
1041       /* Update the header*/
1042       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1043       xdata[i][2*j-1] = rbuf_i[2*j-1];
1044     }
1045     xdata[i][0] = rbuf_0;
1046     xdata[i+1]  = xdata[i] + ct2;
1047     isz1[i]     = ct2; /* size of each message */
1048   }
1049   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1050   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1051   PetscFunctionReturn(0);
1052 }
1053 /* -------------------------------------------------------------------------*/
1054 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
1055 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
1056 /*
1057     Every processor gets the entire matrix
1058 */
1059 #undef __FUNCT__
1060 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   Builds (MAT_INITIAL_MATRIX) or reuses (otherwise) a sequential matrix B on
   every process holding the entire parallel matrix A.  The sparsity pattern is
   gathered only on initial creation; numerical values are gathered only when
   flag == MAT_GET_VALUES.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;  /* ad: diagonal block, bd: off-diagonal block */
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[i] = nonzeros in global row i; fill only the locally owned range, then allgather */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    /* NOTE(review): the send buffer lens+rstart aliases the receive buffer lens;
       MPI formally forbids overlapping buffers in Allgatherv — presumably tolerated
       by the targeted implementations, confirm before relying on this path */
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* write directly into B's j array at the offset of this rank's first row,
       merging bd (below-diagonal part), ad, then bd (above-diagonal part) so each
       row's global column indices come out in ascending order */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion; m still holds the remaining bd entries */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by rank i (sum of its row lengths) */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: the sequential matrix was created by a previous MAT_INITIAL_MATRIX call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same bd/ad/bd merge order as the column-index pass above, so values line up with b->j */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed in a previous call when scall != MAT_INITIAL_MATRIX */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1243 #undef __FUNCT__
1244 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
1245 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
1246 {
1247   PetscErrorCode ierr;
1248   Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
1249   Mat_SubMat     *submatj = c->submatis1;
1250   PetscInt       i;
1251 
1252   PetscFunctionBegin;
1253   ierr = submatj->destroy(C);CHKERRQ(ierr);
1254 
1255   ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);
1256 
1257   for (i=0; i<submatj->nrqr; ++i) {
1258     ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
1259   }
1260   ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);
1261 
1262   ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
1263   ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
1264 
1265   for (i=0; i<submatj->nrqs; ++i) {
1266     ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
1267   }
1268   ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
1269   ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
1270 
1271   ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);
1272 
1273 #if defined(PETSC_USE_CTABLE)
1274   ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
1275   if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
1276   ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
1277 #else
1278   ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
1279 #endif
1280 
1281   if (!submatj->allcolumns) {
1282 #if defined(PETSC_USE_CTABLE)
1283     ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
1284 #else
1285     ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
1286 #endif
1287   }
1288 
1289   ierr = PetscFree(submatj);CHKERRQ(ierr);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 #undef __FUNCT__
1294 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
1295 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1296 {
1297   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1298   Mat            submat,A = c->A,B = c->B;
1299   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1300   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1301   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1302   const PetscInt *icol,*irow;
1303   PetscInt       nrow,ncol,start;
1304   PetscErrorCode ierr;
1305   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1306   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1307   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1308   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1309   PetscInt       *lens,rmax,ncols,*cols,Crow;
1310 #if defined(PETSC_USE_CTABLE)
1311   PetscTable     cmap,rmap;
1312   PetscInt       *cmap_loc,*rmap_loc;
1313 #else
1314   PetscInt       *cmap,*rmap;
1315 #endif
1316   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1317   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1318   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1319   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1320   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1321   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1322   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1323   MPI_Comm       comm;
1324   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1325   PetscMPIInt    *onodes1,*olengths1,idex,end;
1326   Mat_SubMat     *smatis1;
1327   PetscBool      isrowsorted;
1328 
1329   PetscFunctionBegin;
1330   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1331 
1332   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1333   size = c->size;
1334   rank = c->rank;
1335 
1336   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1337   if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");
1338 
1339   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1340   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1341   if (allcolumns) {
1342     icol = NULL;
1343     ncol = C->cmap->N;
1344   } else {
1345     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1346     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1347   }
1348 
1349   if (scall == MAT_INITIAL_MATRIX) {
1350     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1351 
1352     /* Get some new tags to keep the communication clean */
1353     tag1 = ((PetscObject)C)->tag;
1354     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1355     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1356 
1357     /* evaluate communication - mesg to who, length of mesg, and buffer space
1358      required. Based on this, buffers are allocated, and data copied into them */
1359     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1360     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1361 
1362     /* w1[proc] = num of rows owned by proc -- to be requested */
1363     proc = 0;
1364     nrqs = 0; /* num of outgoing messages */
1365     for (j=0; j<nrow; j++) {
1366       row  = irow[j]; /* sorted! */
1367       while (row >= C->rmap->range[proc+1]) proc++;
1368       w1[proc]++;
1369       row2proc[j] = proc; /* map row index to proc */
1370 
1371       if (proc != rank && !w2[proc]) {
1372         w2[proc] = 1; nrqs++;
1373       }
1374     }
1375     w1[rank] = 0;  /* rows owned by self will not be requested */
1376 
1377     ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
1378     for (proc=0,j=0; proc<size; proc++) {
1379       if (w1[proc]) { pa[j++] = proc;}
1380     }
1381 
1382     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1383     msz = 0;              /* total mesg length (for all procs) */
1384     for (i=0; i<nrqs; i++) {
1385       proc      = pa[i];
1386       w1[proc] += 3;
1387       msz      += w1[proc];
1388     }
1389     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1390 
1391     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1392     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1393     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1394 
1395     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1396        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1397     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1398 
1399     /* Now post the Irecvs corresponding to these messages */
1400     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1401 
1402     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1403     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1404 
1405     /* Allocate Memory for outgoing messages */
1406     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1407     ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
1408     ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
1409 
1410     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1411     iptr = tmp;
1412     for (i=0; i<nrqs; i++) {
1413       proc        = pa[i];
1414       sbuf1[proc] = iptr;
1415       iptr       += w1[proc];
1416     }
1417 
1418     /* Form the outgoing messages */
1419     /* Initialize the header space */
1420     for (i=0; i<nrqs; i++) {
1421       proc      = pa[i];
1422       ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
1423       ptr[proc] = sbuf1[proc] + 3;
1424     }
1425 
1426     /* Parse the isrow and copy data into outbuf */
1427     ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
1428     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1429       proc = row2proc[j];
1430       if (proc != rank) { /* copy to the outgoing buf*/
1431         *ptr[proc] = irow[j];
1432         ctr[proc]++; ptr[proc]++;
1433       }
1434     }
1435 
1436     /* Update the headers for the current IS */
1437     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1438       if ((ctr_j = ctr[j])) {
1439         sbuf1_j        = sbuf1[j];
1440         k              = ++sbuf1_j[0];
1441         sbuf1_j[2*k]   = ctr_j;
1442         sbuf1_j[2*k-1] = 0;
1443       }
1444     }
1445 
1446     /* Now post the sends */
1447     ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
1448     for (i=0; i<nrqs; ++i) {
1449       proc = pa[i];
1450       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
1451     }
1452 
1453     /* Post Receives to capture the buffer size */
1454     ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
1455     ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
1456 
1457     rbuf2[0] = tmp + msz;
1458     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1459 
1460     for (i=0; i<nrqs; ++i) {
1461       proc = pa[i];
1462       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
1463     }
1464 
1465     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1466 
1467     /* Send to other procs the buf size they should allocate */
1468     /* Receive messages*/
1469     ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
1470     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1471 
1472     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
1473     for (i=0; i<nrqr; ++i) {
1474       req_size[i] = 0;
1475       rbuf1_i        = rbuf1[i];
1476       start          = 2*rbuf1_i[0] + 1;
1477       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1478       ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
1479       sbuf2_i        = sbuf2[i];
1480       for (j=start; j<end; j++) {
1481         k            = rbuf1_i[j] - rstart;
1482         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1483         sbuf2_i[j]   = ncols;
1484         req_size[i] += ncols;
1485       }
1486       req_source1[i] = r_status1[i].MPI_SOURCE;
1487 
1488       /* form the header */
1489       sbuf2_i[0] = req_size[i];
1490       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1491 
1492       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
1493     }
1494 
1495     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1496     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1497 
1498     /* rbuf2 is received, Post recv column indices a->j */
1499     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
1500 
1501     ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
1502     for (i=0; i<nrqs; ++i) {
1503       ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
1504       req_source2[i] = r_status2[i].MPI_SOURCE;
1505       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
1506     }
1507 
1508     /* Wait on sends1 and sends2 */
1509     ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
1510     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1511     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1512     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1513 
1514     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1515     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1516 
1517     /* Now allocate sending buffers for a->j, and send them off */
1518     ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
1519     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1520     ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
1521     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1522 
1523     for (i=0; i<nrqr; i++) { /* for each requested message */
1524       rbuf1_i   = rbuf1[i];
1525       sbuf_aj_i = sbuf_aj[i];
1526       ct1       = 2*rbuf1_i[0] + 1;
1527       ct2       = 0;
1528       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1529 
1530       kmax = rbuf1[i][2];
1531       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1532         row    = rbuf1_i[ct1] - rstart;
1533         nzA    = ai[row+1] - ai[row];
1534         nzB    = bi[row+1] - bi[row];
1535         ncols  = nzA + nzB;
1536         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1537 
1538         /* load the column indices for this row into cols*/
1539         cols = sbuf_aj_i + ct2;
1540 
1541         lwrite = 0;
1542         for (l=0; l<nzB; l++) {
1543           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1544         }
1545         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1546         for (l=0; l<nzB; l++) {
1547           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1548         }
1549 
1550         ct2 += ncols;
1551       }
1552       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
1553     }
1554 
1555     /* create column map (cmap): global col of C -> local col of submat */
1556 #if defined(PETSC_USE_CTABLE)
1557     if (!allcolumns) {
1558       ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
1559       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1560       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1561         if (icol[j] >= cstart && icol[j] <cend) {
1562           cmap_loc[icol[j] - cstart] = j+1;
1563         } else { /* use PetscTable for non-local col indices */
1564           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1565         }
1566       }
1567     } else {
1568       cmap     = NULL;
1569       cmap_loc = NULL;
1570     }
1571     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1572 #else
1573     if (!allcolumns) {
1574       ierr   = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1575       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1576     } else {
1577       cmap = NULL;
1578     }
1579 #endif
1580 
1581     /* Create lens for MatSeqAIJSetPreallocation() */
1582     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1583 
1584     /* Compute lens from local part of C */
1585     for (j=0; j<nrow; j++) {
1586       row  = irow[j];
1587       proc = row2proc[j];
1588       if (proc == rank) {
1589         /* diagonal part A = c->A */
1590         ncols = ai[row-rstart+1] - ai[row-rstart];
1591         cols  = aj + ai[row-rstart];
1592         if (!allcolumns) {
1593           for (k=0; k<ncols; k++) {
1594 #if defined(PETSC_USE_CTABLE)
1595             tcol = cmap_loc[cols[k]];
1596 #else
1597             tcol = cmap[cols[k]+cstart];
1598 #endif
1599             if (tcol) lens[j]++;
1600           }
1601         } else { /* allcolumns */
1602           lens[j] = ncols;
1603         }
1604 
1605         /* off-diagonal part B = c->B */
1606         ncols = bi[row-rstart+1] - bi[row-rstart];
1607         cols  = bj + bi[row-rstart];
1608         if (!allcolumns) {
1609           for (k=0; k<ncols; k++) {
1610 #if defined(PETSC_USE_CTABLE)
1611             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1612 #else
1613             tcol = cmap[bmap[cols[k]]];
1614 #endif
1615             if (tcol) lens[j]++;
1616           }
1617         } else { /* allcolumns */
1618           lens[j] += ncols;
1619         }
1620       }
1621     }
1622 
1623     /* Create row map (rmap): global row of C -> local row of submat */
1624 #if defined(PETSC_USE_CTABLE)
1625     ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
1626     for (j=0; j<nrow; j++) {
1627       row  = irow[j];
1628       proc = row2proc[j];
1629       if (proc == rank) { /* a local row */
1630         rmap_loc[row - rstart] = j;
1631       } else {
1632         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1633       }
1634     }
1635 #else
1636     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1637     for (j=0; j<nrow; j++) {
1638       rmap[irow[j]] = j;
1639     }
1640 #endif
1641 
1642     /* Update lens from offproc data */
1643     /* recv a->j is done */
1644     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1645     for (i=0; i<nrqs; i++) {
1646       proc    = pa[i];
1647       sbuf1_i = sbuf1[proc];
1648       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1649       ct1     = 2 + 1;
1650       ct2     = 0;
1651       rbuf2_i = rbuf2[i]; /* received length of C->j */
1652       rbuf3_i = rbuf3[i]; /* received C->j */
1653 
1654       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1655       max1   = sbuf1_i[2];
1656       for (k=0; k<max1; k++,ct1++) {
1657 #if defined(PETSC_USE_CTABLE)
1658         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1659         row--;
1660         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1661 #else
1662         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1663 #endif
1664         /* Now, store row index of submat in sbuf1_i[ct1] */
1665         sbuf1_i[ct1] = row;
1666 
1667         nnz = rbuf2_i[ct1];
1668         if (!allcolumns) {
1669           for (l=0; l<nnz; l++,ct2++) {
1670 #if defined(PETSC_USE_CTABLE)
1671             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1672               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1673             } else {
1674               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1675             }
1676 #else
1677             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1678 #endif
1679             if (tcol) lens[row]++;
1680           }
1681         } else { /* allcolumns */
1682           lens[row] += nnz;
1683         }
1684       }
1685     }
1686     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1687     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1688 
1689     /* Create the submatrices */
1690     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1691     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1692 
1693     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1694     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1695     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1696     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1697     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1698 
1699     /* create struct Mat_SubMat and attached it to submat */
1700     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1701     subc = (Mat_SeqAIJ*)submat->data;
1702     subc->submatis1 = smatis1;
1703 
1704     smatis1->nrqs        = nrqs;
1705     smatis1->nrqr        = nrqr;
1706     smatis1->rbuf1       = rbuf1;
1707     smatis1->rbuf2       = rbuf2;
1708     smatis1->rbuf3       = rbuf3;
1709     smatis1->sbuf2       = sbuf2;
1710     smatis1->req_source2 = req_source2;
1711 
1712     smatis1->sbuf1       = sbuf1;
1713     smatis1->ptr         = ptr;
1714     smatis1->tmp         = tmp;
1715     smatis1->ctr         = ctr;
1716 
1717     smatis1->pa           = pa;
1718     smatis1->req_size     = req_size;
1719     smatis1->req_source1  = req_source1;
1720 
1721     smatis1->allcolumns  = allcolumns;
1722     smatis1->row2proc    = row2proc;
1723     smatis1->rmap        = rmap;
1724     smatis1->cmap        = cmap;
1725 #if defined(PETSC_USE_CTABLE)
1726     smatis1->rmap_loc    = rmap_loc;
1727     smatis1->cmap_loc    = cmap_loc;
1728 #endif
1729 
1730     smatis1->destroy     = submat->ops->destroy;
1731     submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
1732     submat->factortype   = C->factortype;
1733 
1734     /* compute rmax */
1735     rmax = 0;
1736     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1737 
1738   } else { /* scall == MAT_REUSE_MATRIX */
1739     submat = submats[0];
1740     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1741 
1742     subc    = (Mat_SeqAIJ*)submat->data;
1743     rmax    = subc->rmax;
1744     smatis1 = subc->submatis1;
1745     nrqs        = smatis1->nrqs;
1746     nrqr        = smatis1->nrqr;
1747     rbuf1       = smatis1->rbuf1;
1748     rbuf2       = smatis1->rbuf2;
1749     rbuf3       = smatis1->rbuf3;
1750     req_source2 = smatis1->req_source2;
1751 
1752     sbuf1     = smatis1->sbuf1;
1753     sbuf2     = smatis1->sbuf2;
1754     ptr       = smatis1->ptr;
1755     tmp       = smatis1->tmp;
1756     ctr       = smatis1->ctr;
1757 
1758     pa         = smatis1->pa;
1759     req_size   = smatis1->req_size;
1760     req_source1 = smatis1->req_source1;
1761 
1762     allcolumns = smatis1->allcolumns;
1763     row2proc   = smatis1->row2proc;
1764     rmap       = smatis1->rmap;
1765     cmap       = smatis1->cmap;
1766 #if defined(PETSC_USE_CTABLE)
1767     rmap_loc   = smatis1->rmap_loc;
1768     cmap_loc   = smatis1->cmap_loc;
1769 #endif
1770   }
1771 
1772   /* Post recv matrix values */
1773   ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1774   ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
1775   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1776   for (i=0; i<nrqs; ++i) {
1777     ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
1778     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1779   }
1780 
1781   /* Allocate sending buffers for a->a, and send them off */
1782   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1783   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1784   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
1785   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1786 
1787   for (i=0; i<nrqr; i++) {
1788     rbuf1_i   = rbuf1[i];
1789     sbuf_aa_i = sbuf_aa[i];
1790     ct1       = 2*rbuf1_i[0]+1;
1791     ct2       = 0;
1792     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1793 
1794     kmax = rbuf1_i[2];
1795     for (k=0; k<kmax; k++,ct1++) {
1796       row = rbuf1_i[ct1] - rstart;
1797       nzA = ai[row+1] - ai[row];
1798       nzB = bi[row+1] - bi[row];
1799       ncols  = nzA + nzB;
1800       cworkB = bj + bi[row];
1801       vworkA = a_a + ai[row];
1802       vworkB = b_a + bi[row];
1803 
1804       /* load the column values for this row into vals*/
1805       vals = sbuf_aa_i + ct2;
1806 
1807       lwrite = 0;
1808       for (l=0; l<nzB; l++) {
1809         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1810       }
1811       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1812       for (l=0; l<nzB; l++) {
1813         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1814       }
1815 
1816       ct2 += ncols;
1817     }
1818     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1819   }
1820 
1821   /* Assemble submat */
1822   /* First assemble the local rows */
1823   for (j=0; j<nrow; j++) {
1824     row  = irow[j];
1825     proc = row2proc[j];
1826     if (proc == rank) {
1827       Crow = row - rstart;  /* local row index of C */
1828 #if defined(PETSC_USE_CTABLE)
1829       row = rmap_loc[Crow]; /* row index of submat */
1830 #else
1831       row = rmap[row];
1832 #endif
1833 
1834       if (allcolumns) {
1835         /* diagonal part A = c->A */
1836         ncols = ai[Crow+1] - ai[Crow];
1837         cols  = aj + ai[Crow];
1838         vals  = a->a + ai[Crow];
1839         i     = 0;
1840         for (k=0; k<ncols; k++) {
1841           subcols[i]   = cols[k] + cstart;
1842           subvals[i++] = vals[k];
1843         }
1844 
1845         /* off-diagonal part B = c->B */
1846         ncols = bi[Crow+1] - bi[Crow];
1847         cols  = bj + bi[Crow];
1848         vals  = b->a + bi[Crow];
1849         for (k=0; k<ncols; k++) {
1850           subcols[i]   = bmap[cols[k]];
1851           subvals[i++] = vals[k];
1852         }
1853 
1854         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1855 
1856       } else { /* !allcolumns */
1857 #if defined(PETSC_USE_CTABLE)
1858         /* diagonal part A = c->A */
1859         ncols = ai[Crow+1] - ai[Crow];
1860         cols  = aj + ai[Crow];
1861         vals  = a->a + ai[Crow];
1862         i     = 0;
1863         for (k=0; k<ncols; k++) {
1864           tcol = cmap_loc[cols[k]];
1865           if (tcol) {
1866             subcols[i]   = --tcol;
1867             subvals[i++] = vals[k];
1868           }
1869         }
1870 
1871         /* off-diagonal part B = c->B */
1872         ncols = bi[Crow+1] - bi[Crow];
1873         cols  = bj + bi[Crow];
1874         vals  = b->a + bi[Crow];
1875         for (k=0; k<ncols; k++) {
1876           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1877           if (tcol) {
1878             subcols[i]   = --tcol;
1879             subvals[i++] = vals[k];
1880           }
1881         }
1882 #else
1883         /* diagonal part A = c->A */
1884         ncols = ai[Crow+1] - ai[Crow];
1885         cols  = aj + ai[Crow];
1886         vals  = a->a + ai[Crow];
1887         i     = 0;
1888         for (k=0; k<ncols; k++) {
1889           tcol = cmap[cols[k]+cstart];
1890           if (tcol) {
1891             subcols[i]   = --tcol;
1892             subvals[i++] = vals[k];
1893           }
1894         }
1895 
1896         /* off-diagonal part B = c->B */
1897         ncols = bi[Crow+1] - bi[Crow];
1898         cols  = bj + bi[Crow];
1899         vals  = b->a + bi[Crow];
1900         for (k=0; k<ncols; k++) {
1901           tcol = cmap[bmap[cols[k]]];
1902           if (tcol) {
1903             subcols[i]   = --tcol;
1904             subvals[i++] = vals[k];
1905           }
1906         }
1907 #endif
1908         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1909       }
1910     }
1911   }
1912 
1913   /* Now assemble the off-proc rows */
1914   for (i=0; i<nrqs; i++) { /* for each requested message */
1915     /* recv values from other processes */
1916     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
1917     proc    = pa[idex];
1918     sbuf1_i = sbuf1[proc];
1919     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1920     ct1     = 2 + 1;
1921     ct2     = 0; /* count of received C->j */
1922     ct3     = 0; /* count of received C->j that will be inserted into submat */
1923     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1924     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1925     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1926 
1927     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1928     max1 = sbuf1_i[2];             /* num of rows */
1929     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1930       row = sbuf1_i[ct1]; /* row index of submat */
1931       if (!allcolumns) {
1932         idex = 0;
1933         if (scall == MAT_INITIAL_MATRIX) {
1934           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1935           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1936 #if defined(PETSC_USE_CTABLE)
1937             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1938               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1939             } else {
1940               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1941             }
1942 #else
1943             tcol = cmap[rbuf3_i[ct2]];
1944 #endif
1945             if (tcol) {
1946               subcols[idex]   = --tcol;
1947               subvals[idex++] = rbuf4_i[ct2];
1948 
1949               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1950                For reuse, we replace received C->j with index that should be inserted to submat */
1951               rbuf3_i[ct3++] = ct2;
1952             }
1953           }
1954           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1955 
1956         } else { /* scall == MAT_REUSE_MATRIX */
1957           submat = submats[0];
1958           subc   = (Mat_SeqAIJ*)submat->data;
1959 
1960           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1961           for (l=0; l<nnz; l++) {
1962             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1963             subvals[idex++] = rbuf4_i[ct2];
1964           }
1965 
1966           bj = subc->j + subc->i[row];
1967           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1968         }
1969       } else { /* allcolumns */
1970         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1971         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1972         ct2 += nnz;
1973       }
1974     }
1975   }
1976 
1977   /* sending a->a are done */
1978   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1979   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1980 
1981   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1982   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1983   submats[0] = submat;
1984 
1985   /* Restore the indices */
1986   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1987   if (!allcolumns) {
1988     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1989   }
1990 
1991   /* Destroy allocated memory */
1992   for (i=0; i<nrqs; ++i) {
1993     ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
1994   }
1995   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1996   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1997   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1998 
1999   if (scall == MAT_INITIAL_MATRIX) {
2000     ierr = PetscFree(lens);CHKERRQ(ierr);
2001     ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2002     ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2003   }
2004   PetscFunctionReturn(0);
2005 }
2006 
2007 #undef __FUNCT__
2008 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
2009 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2010 {
2011   PetscErrorCode ierr;
2012   PetscInt       ncol;
2013   PetscBool      colflag,allcolumns=PETSC_FALSE;
2014 
2015   PetscFunctionBegin;
2016   /* Allocate memory to hold all the submatrices */
2017   if (scall != MAT_REUSE_MATRIX) {
2018     ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
2019   }
2020 
2021   /* Check for special case: each processor gets entire matrix columns */
2022   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2023   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2024   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2025 
2026   ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2027   PetscFunctionReturn(0);
2028 }
2029 
2030 #undef __FUNCT__
2031 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
2032 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2033 {
2034   PetscErrorCode ierr;
2035   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
2036   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
2037 
2038   PetscFunctionBegin;
2039   /* Check for special case: each processor gets entire matrix */
2040   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
2041     ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2042     PetscFunctionReturn(0);
2043   }
2044 
2045   if (ismax == 1 && C->rmap->N == C->cmap->N) {
2046     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
2047     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
2048     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2049     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2050     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2051       wantallmatrix = PETSC_TRUE;
2052 
2053       ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2054     }
2055   }
2056   ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2057   if (twantallmatrix) {
2058     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2059     PetscFunctionReturn(0);
2060   }
2061 
2062   /* Allocate memory to hold all the submatrices */
2063   if (scall != MAT_REUSE_MATRIX) {
2064     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2065   }
2066 
2067   /* Check for special case: each processor gets entire matrix columns */
2068   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
2069   for (i=0; i<ismax; i++) {
2070     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2071     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
2072     if (colflag && ncol == C->cmap->N) {
2073       allcolumns[i] = PETSC_TRUE;
2074     } else {
2075       allcolumns[i] = PETSC_FALSE;
2076     }
2077   }
2078 
2079   /* Determine the number of stages through which submatrices are done */
2080   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
2081 
2082   /*
2083      Each stage will extract nmax submatrices.
2084      nmax is determined by the matrix column dimension.
2085      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2086   */
2087   if (!nmax) nmax = 1;
2088   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
2089 
2090   /* Make sure every processor loops through the nstages */
2091   ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
2092 
2093   for (i=0,pos=0; i<nstages; i++) {
2094     if (pos+nmax <= ismax) max_no = nmax;
2095     else if (pos == ismax) max_no = 0;
2096     else                   max_no = ismax-pos;
2097     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
2098     pos += max_no;
2099   }
2100 
2101   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
2102   PetscFunctionReturn(0);
2103 }
2104 
2105 /* -------------------------------------------------------------------------*/
2106 #undef __FUNCT__
2107 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
2108 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
2109 {
2110   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2111   Mat            A  = c->A;
2112   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
2113   const PetscInt **icol,**irow;
2114   PetscInt       *nrow,*ncol,start;
2115   PetscErrorCode ierr;
2116   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
2117   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
2118   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
2119   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
2120   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2121 #if defined(PETSC_USE_CTABLE)
2122   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2123 #else
2124   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2125 #endif
2126   const PetscInt *irow_i;
2127   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2128   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2129   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2130   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2131   MPI_Status     *r_status3,*r_status4,*s_status4;
2132   MPI_Comm       comm;
2133   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
2134   PetscMPIInt    *onodes1,*olengths1;
2135   PetscMPIInt    idex,idex2,end;
2136 
2137   PetscFunctionBegin;
2138   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2139   tag0 = ((PetscObject)C)->tag;
2140   size = c->size;
2141   rank = c->rank;
2142 
2143   /* Get some new tags to keep the communication clean */
2144   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
2145   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2146   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2147 
2148   ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);
2149 
2150   for (i=0; i<ismax; i++) {
2151     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2152     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2153     if (allcolumns[i]) {
2154       icol[i] = NULL;
2155       ncol[i] = C->cmap->N;
2156     } else {
2157       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2158       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2159     }
2160   }
2161 
2162   /* evaluate communication - mesg to who, length of mesg, and buffer space
2163      required. Based on this, buffers are allocated, and data copied into them*/
2164   ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
2165   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2166   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2167   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
2168   for (i=0; i<ismax; i++) {
2169     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
2170     jmax   = nrow[i];
2171     irow_i = irow[i];
2172     for (j=0; j<jmax; j++) {
2173       l   = 0;
2174       row = irow_i[j];
2175       while (row >= C->rmap->range[l+1]) l++;
2176       proc = l;
2177       w4[proc]++;
2178     }
2179     for (j=0; j<size; j++) {
2180       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
2181     }
2182   }
2183 
2184   nrqs     = 0;              /* no of outgoing messages */
2185   msz      = 0;              /* total mesg length (for all procs) */
2186   w1[rank] = 0;              /* no mesg sent to self */
2187   w3[rank] = 0;
2188   for (i=0; i<size; i++) {
2189     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2190   }
2191   ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
2192   for (i=0,j=0; i<size; i++) {
2193     if (w1[i]) { pa[j] = i; j++; }
2194   }
2195 
2196   /* Each message would have a header = 1 + 2*(no of IS) + data */
2197   for (i=0; i<nrqs; i++) {
2198     j      = pa[i];
2199     w1[j] += w2[j] + 2* w3[j];
2200     msz   += w1[j];
2201   }
2202   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2203 
2204   /* Determine the number of messages to expect, their lengths, from from-ids */
2205   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2206   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2207 
2208   /* Now post the Irecvs corresponding to these messages */
2209   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2210 
2211   ierr = PetscFree(onodes1);CHKERRQ(ierr);
2212   ierr = PetscFree(olengths1);CHKERRQ(ierr);
2213 
2214   /* Allocate Memory for outgoing messages */
2215   ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2216   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
2217   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
2218 
2219   {
2220     PetscInt *iptr = tmp;
2221     k    = 0;
2222     for (i=0; i<nrqs; i++) {
2223       j        = pa[i];
2224       iptr    += k;
2225       sbuf1[j] = iptr;
2226       k        = w1[j];
2227     }
2228   }
2229 
2230   /* Form the outgoing messages */
2231   /* Initialize the header space */
2232   for (i=0; i<nrqs; i++) {
2233     j           = pa[i];
2234     sbuf1[j][0] = 0;
2235     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
2236     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2237   }
2238 
2239   /* Parse the isrow and copy data into outbuf */
2240   for (i=0; i<ismax; i++) {
2241     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
2242     irow_i = irow[i];
2243     jmax   = nrow[i];
2244     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2245       l   = 0;
2246       row = irow_i[j];
2247       while (row >= C->rmap->range[l+1]) l++;
2248       proc = l;
2249       if (proc != rank) { /* copy to the outgoing buf*/
2250         ctr[proc]++;
2251         *ptr[proc] = row;
2252         ptr[proc]++;
2253       }
2254     }
2255     /* Update the headers for the current IS */
2256     for (j=0; j<size; j++) { /* Can Optimise this loop too */
2257       if ((ctr_j = ctr[j])) {
2258         sbuf1_j        = sbuf1[j];
2259         k              = ++sbuf1_j[0];
2260         sbuf1_j[2*k]   = ctr_j;
2261         sbuf1_j[2*k-1] = i;
2262       }
2263     }
2264   }
2265 
2266   /*  Now  post the sends */
2267   ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
2268   for (i=0; i<nrqs; ++i) {
2269     j    = pa[i];
2270     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
2271   }
2272 
2273   /* Post Receives to capture the buffer size */
2274   ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
2275   ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
2276   rbuf2[0] = tmp + msz;
2277   for (i=1; i<nrqs; ++i) {
2278     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2279   }
2280   for (i=0; i<nrqs; ++i) {
2281     j    = pa[i];
2282     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
2283   }
2284 
2285   /* Send to other procs the buf size they should allocate */
2286 
2287 
2288   /* Receive messages*/
2289   ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
2290   ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
2291   ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
2292   {
2293     PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2294     PetscInt   *sbuf2_i;
2295 
2296     for (i=0; i<nrqr; ++i) {
2297       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
2298 
2299       req_size[idex] = 0;
2300       rbuf1_i        = rbuf1[idex];
2301       start          = 2*rbuf1_i[0] + 1;
2302       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
2303       ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
2304       sbuf2_i        = sbuf2[idex];
2305       for (j=start; j<end; j++) {
2306         id              = rbuf1_i[j] - rstart;
2307         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2308         sbuf2_i[j]      = ncols;
2309         req_size[idex] += ncols;
2310       }
2311       req_source[idex] = r_status1[i].MPI_SOURCE;
2312       /* form the header */
2313       sbuf2_i[0] = req_size[idex];
2314       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2315 
2316       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
2317     }
2318   }
2319   ierr = PetscFree(r_status1);CHKERRQ(ierr);
2320   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2321 
2322   /*  recv buffer sizes */
2323   /* Receive messages*/
2324 
2325   ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
2326   ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
2327   ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
2328   ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
2329   ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
2330 
2331   for (i=0; i<nrqs; ++i) {
2332     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
2333     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
2334     ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
2335     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
2336     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
2337   }
2338   ierr = PetscFree(r_status2);CHKERRQ(ierr);
2339   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2340 
2341   /* Wait on sends1 and sends2 */
2342   ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
2343   ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
2344 
2345   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
2346   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
2347   ierr = PetscFree(s_status1);CHKERRQ(ierr);
2348   ierr = PetscFree(s_status2);CHKERRQ(ierr);
2349   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2350   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2351 
2352   /* Now allocate buffers for a->j, and send them off */
2353   ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
2354   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2355   ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
2356   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2357 
2358   ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
2359   {
2360     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2361     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2362     PetscInt cend = C->cmap->rend;
2363     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2364 
2365     for (i=0; i<nrqr; i++) {
2366       rbuf1_i   = rbuf1[i];
2367       sbuf_aj_i = sbuf_aj[i];
2368       ct1       = 2*rbuf1_i[0] + 1;
2369       ct2       = 0;
2370       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2371         kmax = rbuf1[i][2*j];
2372         for (k=0; k<kmax; k++,ct1++) {
2373           row    = rbuf1_i[ct1] - rstart;
2374           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2375           ncols  = nzA + nzB;
2376           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2377 
2378           /* load the column indices for this row into cols*/
2379           cols = sbuf_aj_i + ct2;
2380 
2381           lwrite = 0;
2382           for (l=0; l<nzB; l++) {
2383             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2384           }
2385           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2386           for (l=0; l<nzB; l++) {
2387             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2388           }
2389 
2390           ct2 += ncols;
2391         }
2392       }
2393       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
2394     }
2395   }
2396   ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
2397   ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);
2398 
2399   /* Allocate buffers for a->a, and send them off */
2400   ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
2401   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2402   ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
2403   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2404 
2405   ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
2406   {
2407     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2408     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2409     PetscInt    cend   = C->cmap->rend;
2410     PetscInt    *b_j   = b->j;
2411     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2412 
2413     for (i=0; i<nrqr; i++) {
2414       rbuf1_i   = rbuf1[i];
2415       sbuf_aa_i = sbuf_aa[i];
2416       ct1       = 2*rbuf1_i[0]+1;
2417       ct2       = 0;
2418       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2419         kmax = rbuf1_i[2*j];
2420         for (k=0; k<kmax; k++,ct1++) {
2421           row    = rbuf1_i[ct1] - rstart;
2422           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2423           ncols  = nzA + nzB;
2424           cworkB = b_j + b_i[row];
2425           vworkA = a_a + a_i[row];
2426           vworkB = b_a + b_i[row];
2427 
2428           /* load the column values for this row into vals*/
2429           vals = sbuf_aa_i+ct2;
2430 
2431           lwrite = 0;
2432           for (l=0; l<nzB; l++) {
2433             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2434           }
2435           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2436           for (l=0; l<nzB; l++) {
2437             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2438           }
2439 
2440           ct2 += ncols;
2441         }
2442       }
2443       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
2444     }
2445   }
2446   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
2447   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
2448   ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
2449   ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
2450 
2451   /* Form the matrix */
2452   /* create col map: global col of C -> local col of submatrices */
2453   {
2454     const PetscInt *icol_i;
2455 #if defined(PETSC_USE_CTABLE)
2456     ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
2457     for (i=0; i<ismax; i++) {
2458       if (!allcolumns[i]) {
2459         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
2460 
2461         jmax   = ncol[i];
2462         icol_i = icol[i];
2463         cmap_i = cmap[i];
2464         for (j=0; j<jmax; j++) {
2465           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2466         }
2467       } else {
2468         cmap[i] = NULL;
2469       }
2470     }
2471 #else
2472     ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
2473     for (i=0; i<ismax; i++) {
2474       if (!allcolumns[i]) {
2475         ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2476         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2477         jmax   = ncol[i];
2478         icol_i = icol[i];
2479         cmap_i = cmap[i];
2480         for (j=0; j<jmax; j++) {
2481           cmap_i[icol_i[j]] = j+1;
2482         }
2483       } else {
2484         cmap[i] = NULL;
2485       }
2486     }
2487 #endif
2488   }
2489 
2490   /* Create lens which is required for MatCreate... */
2491   for (i=0,j=0; i<ismax; i++) j += nrow[i];
2492   ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2493   if (ismax) {
2494     ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
2495     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
2496   }
2497   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2498 
2499   /* Update lens from local data */
2500   for (i=0; i<ismax; i++) {
2501     jmax = nrow[i];
2502     if (!allcolumns[i]) cmap_i = cmap[i];
2503     irow_i = irow[i];
2504     lens_i = lens[i];
2505     for (j=0; j<jmax; j++) {
2506       l   = 0;
2507       row = irow_i[j];
2508       while (row >= C->rmap->range[l+1]) l++;
2509       proc = l;
2510       if (proc == rank) {
2511         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2512         if (!allcolumns[i]) {
2513           for (k=0; k<ncols; k++) {
2514 #if defined(PETSC_USE_CTABLE)
2515             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2516 #else
2517             tcol = cmap_i[cols[k]];
2518 #endif
2519             if (tcol) lens_i[j]++;
2520           }
2521         } else { /* allcolumns */
2522           lens_i[j] = ncols;
2523         }
2524         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
2525       }
2526     }
2527   }
2528 
2529   /* Create row map: global row of C -> local row of submatrices */
2530 #if defined(PETSC_USE_CTABLE)
2531   ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
2532   for (i=0; i<ismax; i++) {
2533     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
2534     irow_i = irow[i];
2535     jmax   = nrow[i];
2536     for (j=0; j<jmax; j++) {
2537       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2538     }
2539   }
2540 #else
2541   ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
2542   if (ismax) {
2543     ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
2544     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
2545   }
2546   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
2547   for (i=0; i<ismax; i++) {
2548     rmap_i = rmap[i];
2549     irow_i = irow[i];
2550     jmax   = nrow[i];
2551     for (j=0; j<jmax; j++) {
2552       rmap_i[irow_i[j]] = j;
2553     }
2554   }
2555 #endif
2556 
2557   /* Update lens from offproc data */
2558   {
2559     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2560 
2561     for (tmp2=0; tmp2<nrqs; tmp2++) {
2562       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
2563       idex    = pa[idex2];
2564       sbuf1_i = sbuf1[idex];
2565       jmax    = sbuf1_i[0];
2566       ct1     = 2*jmax+1;
2567       ct2     = 0;
2568       rbuf2_i = rbuf2[idex2];
2569       rbuf3_i = rbuf3[idex2];
2570       for (j=1; j<=jmax; j++) {
2571         is_no  = sbuf1_i[2*j-1];
2572         max1   = sbuf1_i[2*j];
2573         lens_i = lens[is_no];
2574         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2575         rmap_i = rmap[is_no];
2576         for (k=0; k<max1; k++,ct1++) {
2577 #if defined(PETSC_USE_CTABLE)
2578           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2579           row--;
2580           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2581 #else
2582           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2583 #endif
2584           max2 = rbuf2_i[ct1];
2585           for (l=0; l<max2; l++,ct2++) {
2586             if (!allcolumns[is_no]) {
2587 #if defined(PETSC_USE_CTABLE)
2588               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2589 #else
2590               tcol = cmap_i[rbuf3_i[ct2]];
2591 #endif
2592               if (tcol) lens_i[row]++;
2593             } else { /* allcolumns */
2594               lens_i[row]++; /* lens_i[row] += max2 ? */
2595             }
2596           }
2597         }
2598       }
2599     }
2600   }
2601   ierr = PetscFree(r_status3);CHKERRQ(ierr);
2602   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2603   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
2604   ierr = PetscFree(s_status3);CHKERRQ(ierr);
2605   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2606 
2607   /* Create the submatrices */
2608   if (scall == MAT_REUSE_MATRIX) {
2609     PetscBool flag;
2610 
2611     /*
2612         Assumes new rows are same length as the old rows,hence bug!
2613     */
2614     for (i=0; i<ismax; i++) {
2615       mat = (Mat_SeqAIJ*)(submats[i]->data);
2616       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2617       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2618       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
2619       /* Initial matrix as if empty */
2620       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2621 
2622       submats[i]->factortype = C->factortype;
2623     }
2624   } else {
2625     for (i=0; i<ismax; i++) {
2626       PetscInt rbs,cbs;
2627       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2628       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2629 
2630       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2631       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2632 
2633       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2634       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2635       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2636     }
2637   }
2638 
2639   /* Assemble the matrices */
2640   /* First assemble the local rows */
2641   {
2642     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2643     PetscScalar *imat_a;
2644 
2645     for (i=0; i<ismax; i++) {
2646       mat       = (Mat_SeqAIJ*)submats[i]->data;
2647       imat_ilen = mat->ilen;
2648       imat_j    = mat->j;
2649       imat_i    = mat->i;
2650       imat_a    = mat->a;
2651 
2652       if (!allcolumns[i]) cmap_i = cmap[i];
2653       rmap_i = rmap[i];
2654       irow_i = irow[i];
2655       jmax   = nrow[i];
2656       for (j=0; j<jmax; j++) {
2657         l   = 0;
2658         row = irow_i[j];
2659         while (row >= C->rmap->range[l+1]) l++;
2660         proc = l;
2661         if (proc == rank) {
2662           old_row = row;
2663 #if defined(PETSC_USE_CTABLE)
2664           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2665           row--;
2666 #else
2667           row = rmap_i[row];
2668 #endif
2669           ilen_row = imat_ilen[row];
2670           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2671           mat_i    = imat_i[row];
2672           mat_a    = imat_a + mat_i;
2673           mat_j    = imat_j + mat_i;
2674           if (!allcolumns[i]) {
2675             for (k=0; k<ncols; k++) {
2676 #if defined(PETSC_USE_CTABLE)
2677               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2678 #else
2679               tcol = cmap_i[cols[k]];
2680 #endif
2681               if (tcol) {
2682                 *mat_j++ = tcol - 1;
2683                 *mat_a++ = vals[k];
2684                 ilen_row++;
2685               }
2686             }
2687           } else { /* allcolumns */
2688             for (k=0; k<ncols; k++) {
2689               *mat_j++ = cols[k];  /* global col index! */
2690               *mat_a++ = vals[k];
2691               ilen_row++;
2692             }
2693           }
2694           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2695 
2696           imat_ilen[row] = ilen_row;
2697         }
2698       }
2699     }
2700   }
2701 
2702   /*   Now assemble the off proc rows*/
2703   {
2704     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
2705     PetscInt    *imat_j,*imat_i;
2706     PetscScalar *imat_a,*rbuf4_i;
2707 
2708     for (tmp2=0; tmp2<nrqs; tmp2++) {
2709       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
2710       idex    = pa[idex2];
2711       sbuf1_i = sbuf1[idex];
2712       jmax    = sbuf1_i[0];
2713       ct1     = 2*jmax + 1;
2714       ct2     = 0;
2715       rbuf2_i = rbuf2[idex2];
2716       rbuf3_i = rbuf3[idex2];
2717       rbuf4_i = rbuf4[idex2];
2718       for (j=1; j<=jmax; j++) {
2719         is_no     = sbuf1_i[2*j-1];
2720         rmap_i    = rmap[is_no];
2721         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2722         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
2723         imat_ilen = mat->ilen;
2724         imat_j    = mat->j;
2725         imat_i    = mat->i;
2726         imat_a    = mat->a;
2727         max1      = sbuf1_i[2*j];
2728         for (k=0; k<max1; k++,ct1++) {
2729           row = sbuf1_i[ct1];
2730 #if defined(PETSC_USE_CTABLE)
2731           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2732           row--;
2733 #else
2734           row = rmap_i[row];
2735 #endif
2736           ilen  = imat_ilen[row];
2737           mat_i = imat_i[row];
2738           mat_a = imat_a + mat_i;
2739           mat_j = imat_j + mat_i;
2740           max2  = rbuf2_i[ct1];
2741           if (!allcolumns[is_no]) {
2742             for (l=0; l<max2; l++,ct2++) {
2743 
2744 #if defined(PETSC_USE_CTABLE)
2745               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2746 #else
2747               tcol = cmap_i[rbuf3_i[ct2]];
2748 #endif
2749               if (tcol) {
2750                 *mat_j++ = tcol - 1;
2751                 *mat_a++ = rbuf4_i[ct2];
2752                 ilen++;
2753               }
2754             }
2755           } else { /* allcolumns */
2756             for (l=0; l<max2; l++,ct2++) {
2757               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2758               *mat_a++ = rbuf4_i[ct2];
2759               ilen++;
2760             }
2761           }
2762           imat_ilen[row] = ilen;
2763         }
2764       }
2765     }
2766   }
2767 
2768   /* sort the rows */
2769   {
2770     PetscInt    *imat_ilen,*imat_j,*imat_i;
2771     PetscScalar *imat_a;
2772 
2773     for (i=0; i<ismax; i++) {
2774       mat       = (Mat_SeqAIJ*)submats[i]->data;
2775       imat_j    = mat->j;
2776       imat_i    = mat->i;
2777       imat_a    = mat->a;
2778       imat_ilen = mat->ilen;
2779 
2780       if (allcolumns[i]) continue;
2781       jmax = nrow[i];
2782       for (j=0; j<jmax; j++) {
2783         PetscInt ilen;
2784 
2785         mat_i = imat_i[j];
2786         mat_a = imat_a + mat_i;
2787         mat_j = imat_j + mat_i;
2788         ilen  = imat_ilen[j];
2789         ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
2790       }
2791     }
2792   }
2793 
2794   ierr = PetscFree(r_status4);CHKERRQ(ierr);
2795   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2796   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
2797   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2798   ierr = PetscFree(s_status4);CHKERRQ(ierr);
2799 
2800   /* Restore the indices */
2801   for (i=0; i<ismax; i++) {
2802     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2803     if (!allcolumns[i]) {
2804       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2805     }
2806   }
2807 
2808   /* Destroy allocated memory */
2809   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
2810   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2811   ierr = PetscFree(pa);CHKERRQ(ierr);
2812 
2813   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
2814   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
2815   for (i=0; i<nrqr; ++i) {
2816     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
2817   }
2818   for (i=0; i<nrqs; ++i) {
2819     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
2820     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2821   }
2822 
2823   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
2824   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
2825   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2826   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2827   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2828   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2829   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2830 
2831 #if defined(PETSC_USE_CTABLE)
2832   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
2833 #else
2834   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
2835 #endif
2836   ierr = PetscFree(rmap);CHKERRQ(ierr);
2837 
2838   for (i=0; i<ismax; i++) {
2839     if (!allcolumns[i]) {
2840 #if defined(PETSC_USE_CTABLE)
2841       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
2842 #else
2843       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
2844 #endif
2845     }
2846   }
2847   ierr = PetscFree(cmap);CHKERRQ(ierr);
2848   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2849   ierr = PetscFree(lens);CHKERRQ(ierr);
2850 
2851   for (i=0; i<ismax; i++) {
2852     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2853     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2854   }
2855   PetscFunctionReturn(0);
2856 }
2857 
2858 /*
2859  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2860  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2861  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2862  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2863  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2864  state, and needs to be "assembled" later by compressing B's column space.
2865 
2866  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2867  Following this call, C->A & C->B have been created, even if empty.
2868  */
2869 #undef __FUNCT__
2870 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
2871 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2872 {
2873   /* If making this function public, change the error returned in this function away from _PLIB. */
2874   PetscErrorCode ierr;
2875   Mat_MPIAIJ     *aij;
2876   Mat_SeqAIJ     *Baij;
2877   PetscBool      seqaij,Bdisassembled;
2878   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2879   PetscScalar    v;
2880   const PetscInt *rowindices,*colindices;
2881 
2882   PetscFunctionBegin;
2883   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2884   if (A) {
2885     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2886     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2887     if (rowemb) {
2888       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2889       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2890     } else {
2891       if (C->rmap->n != A->rmap->n) {
2892 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2893       }
2894     }
2895     if (dcolemb) {
2896       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2897       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2898     } else {
2899       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2900     }
2901   }
2902   if (B) {
2903     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2904     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2905     if (rowemb) {
2906       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2907       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2908     } else {
2909       if (C->rmap->n != B->rmap->n) {
2910 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2911       }
2912     }
2913     if (ocolemb) {
2914       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2915       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2916     } else {
2917       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2918     }
2919   }
2920 
2921   aij    = (Mat_MPIAIJ*)(C->data);
2922   if (!aij->A) {
2923     /* Mimic parts of MatMPIAIJSetPreallocation() */
2924     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2925     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2926     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2927     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2928     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2929   }
2930   if (A) {
2931     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2932   } else {
2933     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2934   }
2935   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2936     /*
2937       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2938       need to "disassemble" B -- convert it to using C's global indices.
2939       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2940 
2941       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2942       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2943 
2944       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2945       At least avoid calling MatSetValues() and the implied searches?
2946     */
2947 
2948     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2949 #if defined(PETSC_USE_CTABLE)
2950       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2951 #else
2952       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2953       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2954       if (aij->B) {
2955         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2956       }
2957 #endif
2958       ngcol = 0;
2959       if (aij->lvec) {
2960 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2961       }
2962       if (aij->garray) {
2963 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2964 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2965       }
2966       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2967       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2968     }
2969     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2970       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2971     }
2972     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2973       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2974     }
2975   }
2976   Bdisassembled = PETSC_FALSE;
2977   if (!aij->B) {
2978     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2979     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2980     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2981     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2982     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2983     Bdisassembled = PETSC_TRUE;
2984   }
2985   if (B) {
2986     Baij = (Mat_SeqAIJ*)(B->data);
2987     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2988       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2989       for (i=0; i<B->rmap->n; i++) {
2990 	nz[i] = Baij->i[i+1] - Baij->i[i];
2991       }
2992       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2993       ierr = PetscFree(nz);CHKERRQ(ierr);
2994     }
2995 
2996     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
2997     shift = rend-rstart;
2998     count = 0;
2999     rowindices = NULL;
3000     colindices = NULL;
3001     if (rowemb) {
3002       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3003     }
3004     if (ocolemb) {
3005       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3006     }
3007     for (i=0; i<B->rmap->n; i++) {
3008       PetscInt row;
3009       row = i;
3010       if (rowindices) row = rowindices[i];
3011       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3012 	col  = Baij->j[count];
3013 	if (colindices) col = colindices[col];
3014 	if (Bdisassembled && col>=rstart) col += shift;
3015 	v    = Baij->a[count];
3016 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3017 	++count;
3018       }
3019     }
3020     /* No assembly for aij->B is necessary. */
3021     /* FIXME: set aij->B's nonzerostate correctly. */
3022   } else {
3023     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3024   }
3025   C->preallocated  = PETSC_TRUE;
3026   C->was_assembled = PETSC_FALSE;
3027   C->assembled     = PETSC_FALSE;
3028    /*
3029       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3030       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3031    */
3032   PetscFunctionReturn(0);
3033 }
3034 
3035 #undef __FUNCT__
3036 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
3037 /*
3038   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3039  */
3040 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3041 {
3042   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
3043 
3044   PetscFunctionBegin;
3045   PetscValidPointer(A,2);
3046   PetscValidPointer(B,3);
3047   /* FIXME: make sure C is assembled */
3048   *A = aij->A;
3049   *B = aij->B;
3050   /* Note that we don't incref *A and *B, so be careful! */
3051   PetscFunctionReturn(0);
3052 }
3053 
3054 /*
3055   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3056   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3057 */
3058 #undef __FUNCT__
3059 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
/*
  MatGetSubMatricesMPI_MPIXAIJ - format-independent engine for extracting submatrices of C
  whose index sets (isrow[]/iscol[]) may live on subcommunicators of more than one rank,
  so that the resulting submat[i] may itself be a parallel (MPI) matrix.

  The format-specific work is delegated to four callbacks:
    getsubmats_seq - extracts purely sequential submatrices; used for all diag pieces A[i]
                     and off-diag pieces B[ii].
    getlocalmats   - retrieves the sequential diag (A) and off-diag (B) matrices underlying
                     an existing parallel submatrix; used only on the MAT_REUSE_MATRIX path.
    setseqmat      - embeds a sequential matrix into a sequential submat[i], applying the
                     row/column permutations (the isize == 1 case).
    setseqmats     - embeds the A and B pieces into a parallel submat[i] (the isize > 1 case).

  Notation used throughout:
    i      - index over all ismax index sets;
    ii     - index over only the *parallel* (comm size > 1) index sets; there are cismax of
             those, and cisrow/ciscol/ciscol_p/B are indexed by ii;
    *_p    - permutation ISs produced by ISSortPermutation(); NULL when the corresponding
             IS was already strictly increasing.

  Collective on C: the MPIU_Allreduce below requires participation of every rank of C's
  communicator.  NOTE(review): the early return on !ismax is therefore only safe if ismax is
  zero on all of C's ranks simultaneously -- TODO confirm the caller guarantees this.
*/
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel index sets (cismax) and validate that each row/col IS pair shares a communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): shadows the function-scope 'isize'; harmless but triggers -Wshadow. */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
	 TODO: This is the part that's ***NOT SCALABLE***.
	 To fix this we need to extract just the indices of C's nonzero columns
	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
	 to be done without serializing on the IS list, so, most likely, it is best
	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
	     were sorted on each rank, concatenated they might no longer be sorted;
	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
	     indices in the nondecreasing order to the original index positions.
	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  /* Sort all row and column ISs in place (getsubmats_seq expects sorted indices) and
     record the permutations needed to scatter the extracted values back into the
     caller's original ordering. Duplicate indices are rejected outright. */
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
     */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      /* After sorting, duplicates are necessarily adjacent. */
      if (indices[j] == indices[j-1]) {
	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);


    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /* Off-diag extraction uses the same rows as the diag piece: alias without incref. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
	Mat AA,BB;
	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
	if (!isrow_p[i] && !iscol_p[i]) {
	  /* No permutation needed: getsubmats_seq can deposit straight into the submat's diag piece. */
	  A[i] = AA;
	} else {
	  /* TODO: extract A[i] composed on AA. */
	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
	}
	if (!isrow_p[i] && !ciscol_p[ii]) {
	  B[ii] = BB;
	} else {
	  /* TODO: extract B[ii] composed on BB. */
	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
	}
	++ii;
      } else {
	if (!isrow_p[i] && !iscol_p[i]) {
	  /* Sequential submat is itself the diag piece. */
	  A[i] = (*submat)[i];
	} else {
	  /* TODO: extract A[i] composed on (*submat)[i]. */
	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
	}
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* I did not figure out a good way to fix it right now.
   * Local column size of B[i] is different from the size of ciscol[i].
   * B[i]'s size is finally determined by MatAssembly, while
   * ciscol[i] is computed as the complement of iscol[i].
   * It is better to keep only nonzero indices when computing
   * the complement ciscol[i].
   * */
  /* NOTE(review): directly overwriting cmap->n works around the B[i]/ciscol[i] size mismatch
     described above; it bypasses PetscLayout setup, so confirm no other layout field
     (N, range) is stale after this. */
  if(scall==MAT_REUSE_MATRIX){
	for(i=0; i<cismax; i++){
	  ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
	  B[i]->cmap->n = ciscol_localsize;
	}
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
	- if caching arrays to hold the ISs, make and compose a container for them so that it can
	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
	Mat AA,BB;
	/* AA/BB are non-NULL exactly when the corresponding piece must still be embedded:
	   either this is a fresh extraction, or a permutation has to be applied. */
	AA = NULL;
	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
	BB = NULL;
	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
	if (AA || BB) {
	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
	}
	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
	}
	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
	}
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
	if (isrow_p[i] || iscol_p[i]) {
	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
	} else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
	/* Otherwise A is extracted straight into (*submats)[i]. */
	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
	ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  /* cisrow[] holds only aliases of isrow[], so freeing the array (not the ISs) is correct. */
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3318 
3319 
3320 
3321 #undef __FUNCT__
3322 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
3323 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3324 {
3325   PetscErrorCode ierr;
3326 
3327   PetscFunctionBegin;
3328   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3329   PetscFunctionReturn(0);
3330 }
3331