xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision e5265faf5e3e5d0db06dbb9f30d95b1e01199e35)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
33 
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - Add one level of overlap to each IS
  using sparse neighbor communication (PetscCommBuildTwoSided + PetscSF), so
  that storage and message volume scale with the number of actual neighbor
  ranks rather than with the communicator size.

  Outline:
    1) collect every index in the ISs that is owned by a remote rank;
    2) build the two-sided pattern and ship <is id, row> pairs to the owners;
    3) owners gather the column indices of the requested rows (Send_Scalable);
    4) replies are shipped back and merged into the ISs (Receive_Scalable),
       while the purely local part of the overlap (Local_Scalable) is computed
       overlapped with the communication.

  The ISs in is[] are replaced in place by their enlarged versions.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record where the row lives, which IS it came from,
       * and the row itself, so the requests can be grouped per owner below */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: two PetscInts (<is id,row>) per requested row */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  /* one SF leaf per incoming PetscInt; the root index is the sender's offset plus j */
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row>  */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  /* pack the requests; toffsets[j] is advanced as each pair is written */
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes/offsets travel back along the reversed pattern */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: do the local overlap while replies are in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
190 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap rows received from
  remote owners back into the local index sets.

  recvdata is a packed stream of triples <is id, data size, rows...>; nrecvs is
  its total length in PetscInts.  Each IS is destroyed and re-created on its
  original communicator with the union of its previous indices and the received
  ones, sorted with duplicates removed.  Each IS gets a slot of
  (max_lsize + nrecvs) entries in the scratch array, which is an upper bound on
  the merged size before duplicate removal.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  /* record current sizes and the largest IS to size the scratch array */
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  /* copy existing indices into each IS's slot, keep the comm alive, then drop the IS */
  for (i=0; i<nidx; i++) {
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and rebuild each IS */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
239 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - For every row requested by a remote
  rank, gather the global column indices of that row so the requester can
  extend its index sets.

  fromrows holds pairs <is id, row> as delivered by the first SF exchange;
  fromsizes[2*i] is the length of rank i's request.  The reply for each rank is
  packed as triples <is id, number of columns, columns...> into *sbrows, with
  the per-rank reply size in sbsizes[2*i] and its offset in sbsizes[2*i+1].
  The caller takes ownership of *sbrowsizes and *sbrows.

  The work is done in two identical passes over the requests: the first pass
  only counts (indv_counts) so a single allocation of the exact reply size can
  be made, the second pass fills it in.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the (deduplicated) reply size for every <rank, is> pair */
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together: bucket the requested rows by the IS they belong to */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row    = rows_i[l]-rstart;
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++) { /* Amat: local columns shifted to global numbering */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat: gcols maps to global column numbers */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, but now packing the reply buffer */
  for (i=0;i<nfrom;i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the per-rank sizes to get the offsets stored in the odd slots */
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
362 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - Compute the local part of one
  overlap level: for every locally-owned row appearing in an IS, add the
  global column indices of that row (from both the diagonal block A and the
  off-diagonal block B) to the IS.  Rows owned by other ranks are skipped;
  they are handled by the Send/Receive pair.  Each IS is destroyed and
  re-created on (a duplicate of) its original communicator with the sorted,
  deduplicated result.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      /* remote rows are handled by the Send/Receive pair, not here */
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: shift local column numbers to global */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: gcols maps to global column numbers */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
   /* keep the comm alive across the destroy so the new IS lands on the same comm */
   ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
   ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
   ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
   ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
   ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
430 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1: total entries to each rank; w2: 0/1 flag; w3: number of ISs contributing
     to each rank; w4: per-IS scratch counts */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and the from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  /* carve the single tmp buffer into per-destination segments of length w1[j] */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: outdat[j][0] = no of ISs; then 2*w3[j] header slots */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the sends/receives are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheckFalse(proc != onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and the from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
      /* Process the message: header is <no of IS, (is_no, count) pairs>, then data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array by the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated rows (table stores 1-based key/value pairs) */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  if (xdata) {
    ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
    ierr = PetscFree(xdata);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
799 
800 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
802        the work on the local processor.
803 
804      Inputs:
805       C      - MAT_MPIAIJ;
806       imax - total no of index sets processed at a time;
807       table  - an array of char - size = m bits.
808 
809      Output:
810       isz    - array containing the count of the solution elements corresponding
811                to each index set;
812       data or table_data  - pointer to the solutions
813 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;    /* A: diagonal block, B: off-diagonal block of the local rows */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;   /* first global row owned by this process */
  cstart = C->cmap->rstart;   /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;         /* local-to-global map for B's compressed column indices */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    PetscCheckFalse(tcount != isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      /* table stores (row+1 -> position+1); undo both +1 shifts while flattening */
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      PetscCheckFalse(j > tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];   /* only sweep the rows already in the set; new entries are appended past max */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* diagonal-block column -> global column */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* off-diagonal column -> global column via garray */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;   /* publish the enlarged count for IS i */

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
902 
903 /*
904       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905          and return the output
906 
907          Input:
908            C    - the matrix
909            nrqr - no of messages being processed.
910            rbuf - an array of pointers to the received requests
911 
912          Output:
913            xdata - array of messages to be sent back
914            isz1  - size of each message
915 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
920 
921 */
922 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923 {
924   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
925   Mat            A  = c->A,B = c->B;
926   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
927   PetscErrorCode ierr;
928   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931   PetscInt       *rbuf_i,kmax,rbuf_0;
932   PetscBT        xtable;
933 
934   PetscFunctionBegin;
935   m      = C->rmap->N;
936   rstart = C->rmap->rstart;
937   cstart = C->cmap->rstart;
938   ai     = a->i;
939   aj     = a->j;
940   bi     = b->i;
941   bj     = b->j;
942   garray = c->garray;
943 
944   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945     rbuf_i =  rbuf[i];
946     rbuf_0 =  rbuf_i[0];
947     ct    += rbuf_0;
948     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949   }
950 
951   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952   else max1 = 1;
953   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954   if (nrqr) {
955     ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
956     ++no_malloc;
957   }
958   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
959   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
960 
961   ct3 = 0;
962   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
963     rbuf_i =  rbuf[i];
964     rbuf_0 =  rbuf_i[0];
965     ct1    =  2*rbuf_0+1;
966     ct2    =  ct1;
967     ct3   += ct1;
968     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
970       oct2 = ct2;
971       kmax = rbuf_i[2*j];
972       for (k=0; k<kmax; k++,ct1++) {
973         row = rbuf_i[ct1];
974         if (!PetscBTLookupSet(xtable,row)) {
975           if (!(ct3 < mem_estimate)) {
976             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
978             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
979             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
980             xdata[0]     = tmp;
981             mem_estimate = new_estimate; ++no_malloc;
982             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983           }
984           xdata[i][ct2++] = row;
985           ct3++;
986         }
987       }
988       for (k=oct2,max2=ct2; k<max2; k++) {
989         row   = xdata[i][k] - rstart;
990         start = ai[row];
991         end   = ai[row+1];
992         for (l=start; l<end; l++) {
993           val = aj[l] + cstart;
994           if (!PetscBTLookupSet(xtable,val)) {
995             if (!(ct3 < mem_estimate)) {
996               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
998               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
999               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1000               xdata[0]     = tmp;
1001               mem_estimate = new_estimate; ++no_malloc;
1002               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003             }
1004             xdata[i][ct2++] = val;
1005             ct3++;
1006           }
1007         }
1008         start = bi[row];
1009         end   = bi[row+1];
1010         for (l=start; l<end; l++) {
1011           val = garray[bj[l]];
1012           if (!PetscBTLookupSet(xtable,val)) {
1013             if (!(ct3 < mem_estimate)) {
1014               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1016               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1017               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1018               xdata[0]     = tmp;
1019               mem_estimate = new_estimate; ++no_malloc;
1020               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021             }
1022             xdata[i][ct2++] = val;
1023             ct3++;
1024           }
1025         }
1026       }
1027       /* Update the header*/
1028       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030     }
1031     xdata[i][0] = rbuf_0;
1032     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1033     isz1[i]     = ct2; /* size of each message */
1034   }
1035   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1036   ierr = PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 /* -------------------------------------------------------------------------*/
1040 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041 /*
1042     Every processor gets the entire matrix
1043 */
/*
   MatCreateSubMatrix_MPIAIJ_All - gathers the entire distributed matrix A onto
   every process as a sequential matrix stored in (*Bin)[0].

   Input:
     A     - the distributed MPIAIJ matrix
     flag  - MAT_GET_VALUES gathers numerical values too; otherwise only the
             nonzero structure is assembled (values left unset)
     scall - MAT_INITIAL_MATRIX builds the sequential matrix;
             MAT_REUSE_MATRIX refills a previously created one

   Output:
     Bin   - on MAT_INITIAL_MATRIX, a calloc'ed 2-entry array whose first entry
             is the new sequential matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = nnz in diagonal block + nnz in off-diagonal block */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    /* NOTE(review): recvcounts/displs are PetscMPIInt, so global sizes are
       assumed to fit in an int (standard MPI count limitation) */
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgather: each rank contributes its own slice of lens[] */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write my rows' column indices directly into their final slot of b->j */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge B(below-diagonal), A, B(above-diagonal) so each row ends up in
       ascending global column order */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      /* recvcounts[i] = total nonzeros in the rows owned by rank i */
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, just look up the matrix */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    /* write my rows' values directly into their final slot of b->a */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same below/diag/above merge order as the column-index copy above */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the structure phase did not allocate these */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1216 
1217 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1218 {
1219   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1220   Mat            submat,A = c->A,B = c->B;
1221   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1222   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1223   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1224   const PetscInt *icol,*irow;
1225   PetscInt       nrow,ncol,start;
1226   PetscErrorCode ierr;
1227   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1228   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1229   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1230   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1231   PetscInt       *lens,rmax,ncols,*cols,Crow;
1232 #if defined(PETSC_USE_CTABLE)
1233   PetscTable     cmap,rmap;
1234   PetscInt       *cmap_loc,*rmap_loc;
1235 #else
1236   PetscInt       *cmap,*rmap;
1237 #endif
1238   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1239   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1240   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1241   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1242   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1243   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1244   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1245   MPI_Comm       comm;
1246   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1247   PetscMPIInt    *onodes1,*olengths1,idex,end;
1248   Mat_SubSppt    *smatis1;
1249   PetscBool      isrowsorted,iscolsorted;
1250 
1251   PetscFunctionBegin;
1252   PetscValidLogicalCollectiveInt(C,ismax,2);
1253   PetscValidLogicalCollectiveEnum(C,scall,5);
1254   PetscCheckFalse(ismax != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1255   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1256   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1257   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1258   size = c->size;
1259   rank = c->rank;
1260 
1261   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1262   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1263   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1264   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1265   if (allcolumns) {
1266     icol = NULL;
1267     ncol = C->cmap->N;
1268   } else {
1269     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1270     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1271   }
1272 
1273   if (scall == MAT_INITIAL_MATRIX) {
1274     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1275 
1276     /* Get some new tags to keep the communication clean */
1277     tag1 = ((PetscObject)C)->tag;
1278     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1279     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1280 
1281     /* evaluate communication - mesg to who, length of mesg, and buffer space
1282      required. Based on this, buffers are allocated, and data copied into them */
1283     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1284     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1285 
1286     /* w1[proc] = num of rows owned by proc -- to be requested */
1287     proc = 0;
1288     nrqs = 0; /* num of outgoing messages */
1289     for (j=0; j<nrow; j++) {
1290       row  = irow[j];
1291       if (!isrowsorted) proc = 0;
1292       while (row >= C->rmap->range[proc+1]) proc++;
1293       w1[proc]++;
1294       row2proc[j] = proc; /* map row index to proc */
1295 
1296       if (proc != rank && !w2[proc]) {
1297         w2[proc] = 1; nrqs++;
1298       }
1299     }
1300     w1[rank] = 0;  /* rows owned by self will not be requested */
1301 
1302     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
1303     for (proc=0,j=0; proc<size; proc++) {
1304       if (w1[proc]) { pa[j++] = proc;}
1305     }
1306 
1307     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1308     msz = 0;              /* total mesg length (for all procs) */
1309     for (i=0; i<nrqs; i++) {
1310       proc      = pa[i];
1311       w1[proc] += 3;
1312       msz      += w1[proc];
1313     }
1314     ierr = PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);CHKERRQ(ierr);
1315 
1316     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1317     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1318     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1319 
1320     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1321        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1322     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1323 
1324     /* Now post the Irecvs corresponding to these messages */
1325     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1326 
1327     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1328     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1329 
1330     /* Allocate Memory for outgoing messages */
1331     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1332     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1333     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1334 
1335     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1336     iptr = tmp;
1337     for (i=0; i<nrqs; i++) {
1338       proc        = pa[i];
1339       sbuf1[proc] = iptr;
1340       iptr       += w1[proc];
1341     }
1342 
1343     /* Form the outgoing messages */
1344     /* Initialize the header space */
1345     for (i=0; i<nrqs; i++) {
1346       proc      = pa[i];
1347       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1348       ptr[proc] = sbuf1[proc] + 3;
1349     }
1350 
1351     /* Parse the isrow and copy data into outbuf */
1352     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1353     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1354       proc = row2proc[j];
1355       if (proc != rank) { /* copy to the outgoing buf*/
1356         *ptr[proc] = irow[j];
1357         ctr[proc]++; ptr[proc]++;
1358       }
1359     }
1360 
1361     /* Update the headers for the current IS */
1362     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1363       if ((ctr_j = ctr[j])) {
1364         sbuf1_j        = sbuf1[j];
1365         k              = ++sbuf1_j[0];
1366         sbuf1_j[2*k]   = ctr_j;
1367         sbuf1_j[2*k-1] = 0;
1368       }
1369     }
1370 
1371     /* Now post the sends */
1372     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
1373     for (i=0; i<nrqs; ++i) {
1374       proc = pa[i];
1375       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1376     }
1377 
1378     /* Post Receives to capture the buffer size */
1379     ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr);
1380     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
1381 
1382     if (nrqs) rbuf2[0] = tmp + msz;
1383     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1384 
1385     for (i=0; i<nrqs; ++i) {
1386       proc = pa[i];
1387       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1388     }
1389 
1390     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1391 
1392     /* Send to other procs the buf size they should allocate */
1393     /* Receive messages*/
1394     ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
1395     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1396 
1397     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1398     for (i=0; i<nrqr; ++i) {
1399       req_size[i] = 0;
1400       rbuf1_i        = rbuf1[i];
1401       start          = 2*rbuf1_i[0] + 1;
1402       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1403       ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
1404       sbuf2_i        = sbuf2[i];
1405       for (j=start; j<end; j++) {
1406         k            = rbuf1_i[j] - rstart;
1407         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1408         sbuf2_i[j]   = ncols;
1409         req_size[i] += ncols;
1410       }
1411       req_source1[i] = r_status1[i].MPI_SOURCE;
1412 
1413       /* form the header */
1414       sbuf2_i[0] = req_size[i];
1415       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1416 
1417       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1418     }
1419 
1420     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1421     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1422 
1423     /* rbuf2 is received, Post recv column indices a->j */
1424     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1425 
1426     ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr);
1427     for (i=0; i<nrqs; ++i) {
1428       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
1429       req_source2[i] = r_status2[i].MPI_SOURCE;
1430       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1431     }
1432 
1433     /* Wait on sends1 and sends2 */
1434     ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr);
1435     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1436     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1437     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1438 
1439     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1440     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1441 
1442     /* Now allocate sending buffers for a->j, and send them off */
1443     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
1444     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1445     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
1446     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1447 
1448     for (i=0; i<nrqr; i++) { /* for each requested message */
1449       rbuf1_i   = rbuf1[i];
1450       sbuf_aj_i = sbuf_aj[i];
1451       ct1       = 2*rbuf1_i[0] + 1;
1452       ct2       = 0;
1453       /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1454 
1455       kmax = rbuf1[i][2];
1456       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1457         row    = rbuf1_i[ct1] - rstart;
1458         nzA    = ai[row+1] - ai[row];
1459         nzB    = bi[row+1] - bi[row];
1460         ncols  = nzA + nzB;
1461         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1462 
1463         /* load the column indices for this row into cols*/
1464         cols = sbuf_aj_i + ct2;
1465 
1466         lwrite = 0;
1467         for (l=0; l<nzB; l++) {
1468           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1469         }
1470         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1471         for (l=0; l<nzB; l++) {
1472           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1473         }
1474 
1475         ct2 += ncols;
1476       }
1477       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1478     }
1479 
1480     /* create column map (cmap): global col of C -> local col of submat */
1481 #if defined(PETSC_USE_CTABLE)
1482     if (!allcolumns) {
1483       ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr);
1484       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1485       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1486         if (icol[j] >= cstart && icol[j] <cend) {
1487           cmap_loc[icol[j] - cstart] = j+1;
1488         } else { /* use PetscTable for non-local col indices */
1489           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1490         }
1491       }
1492     } else {
1493       cmap     = NULL;
1494       cmap_loc = NULL;
1495     }
1496     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1497 #else
1498     if (!allcolumns) {
1499       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1500       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1501     } else {
1502       cmap = NULL;
1503     }
1504 #endif
1505 
1506     /* Create lens for MatSeqAIJSetPreallocation() */
1507     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1508 
1509     /* Compute lens from local part of C */
1510     for (j=0; j<nrow; j++) {
1511       row  = irow[j];
1512       proc = row2proc[j];
1513       if (proc == rank) {
1514         /* diagonal part A = c->A */
1515         ncols = ai[row-rstart+1] - ai[row-rstart];
1516         cols  = aj + ai[row-rstart];
1517         if (!allcolumns) {
1518           for (k=0; k<ncols; k++) {
1519 #if defined(PETSC_USE_CTABLE)
1520             tcol = cmap_loc[cols[k]];
1521 #else
1522             tcol = cmap[cols[k]+cstart];
1523 #endif
1524             if (tcol) lens[j]++;
1525           }
1526         } else { /* allcolumns */
1527           lens[j] = ncols;
1528         }
1529 
1530         /* off-diagonal part B = c->B */
1531         ncols = bi[row-rstart+1] - bi[row-rstart];
1532         cols  = bj + bi[row-rstart];
1533         if (!allcolumns) {
1534           for (k=0; k<ncols; k++) {
1535 #if defined(PETSC_USE_CTABLE)
1536             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1537 #else
1538             tcol = cmap[bmap[cols[k]]];
1539 #endif
1540             if (tcol) lens[j]++;
1541           }
1542         } else { /* allcolumns */
1543           lens[j] += ncols;
1544         }
1545       }
1546     }
1547 
1548     /* Create row map (rmap): global row of C -> local row of submat */
1549 #if defined(PETSC_USE_CTABLE)
1550     ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr);
1551     for (j=0; j<nrow; j++) {
1552       row  = irow[j];
1553       proc = row2proc[j];
1554       if (proc == rank) { /* a local row */
1555         rmap_loc[row - rstart] = j;
1556       } else {
1557         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1558       }
1559     }
1560 #else
1561     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1562     for (j=0; j<nrow; j++) {
1563       rmap[irow[j]] = j;
1564     }
1565 #endif
1566 
1567     /* Update lens from offproc data */
1568     /* recv a->j is done */
1569     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1570     for (i=0; i<nrqs; i++) {
1571       proc    = pa[i];
1572       sbuf1_i = sbuf1[proc];
1573       /* jmax    = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1574       ct1     = 2 + 1;
1575       ct2     = 0;
1576       rbuf2_i = rbuf2[i]; /* received length of C->j */
1577       rbuf3_i = rbuf3[i]; /* received C->j */
1578 
1579       /* is_no  = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1580       max1   = sbuf1_i[2];
1581       for (k=0; k<max1; k++,ct1++) {
1582 #if defined(PETSC_USE_CTABLE)
1583         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1584         row--;
1585         PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1586 #else
1587         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1588 #endif
1589         /* Now, store row index of submat in sbuf1_i[ct1] */
1590         sbuf1_i[ct1] = row;
1591 
1592         nnz = rbuf2_i[ct1];
1593         if (!allcolumns) {
1594           for (l=0; l<nnz; l++,ct2++) {
1595 #if defined(PETSC_USE_CTABLE)
1596             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1597               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1598             } else {
1599               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1600             }
1601 #else
1602             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1603 #endif
1604             if (tcol) lens[row]++;
1605           }
1606         } else { /* allcolumns */
1607           lens[row] += nnz;
1608         }
1609       }
1610     }
1611     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1612     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1613 
1614     /* Create the submatrices */
1615     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1616     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1617 
1618     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1619     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1620     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1621     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1622     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1623 
1624     /* create struct Mat_SubSppt and attached it to submat */
1625     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1626     subc = (Mat_SeqAIJ*)submat->data;
1627     subc->submatis1 = smatis1;
1628 
1629     smatis1->id          = 0;
1630     smatis1->nrqs        = nrqs;
1631     smatis1->nrqr        = nrqr;
1632     smatis1->rbuf1       = rbuf1;
1633     smatis1->rbuf2       = rbuf2;
1634     smatis1->rbuf3       = rbuf3;
1635     smatis1->sbuf2       = sbuf2;
1636     smatis1->req_source2 = req_source2;
1637 
1638     smatis1->sbuf1       = sbuf1;
1639     smatis1->ptr         = ptr;
1640     smatis1->tmp         = tmp;
1641     smatis1->ctr         = ctr;
1642 
1643     smatis1->pa           = pa;
1644     smatis1->req_size     = req_size;
1645     smatis1->req_source1  = req_source1;
1646 
1647     smatis1->allcolumns  = allcolumns;
1648     smatis1->singleis    = PETSC_TRUE;
1649     smatis1->row2proc    = row2proc;
1650     smatis1->rmap        = rmap;
1651     smatis1->cmap        = cmap;
1652 #if defined(PETSC_USE_CTABLE)
1653     smatis1->rmap_loc    = rmap_loc;
1654     smatis1->cmap_loc    = cmap_loc;
1655 #endif
1656 
1657     smatis1->destroy     = submat->ops->destroy;
1658     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1659     submat->factortype   = C->factortype;
1660 
1661     /* compute rmax */
1662     rmax = 0;
1663     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1664 
1665   } else { /* scall == MAT_REUSE_MATRIX */
1666     submat = submats[0];
1667     PetscCheckFalse(submat->rmap->n != nrow || submat->cmap->n != ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1668 
1669     subc    = (Mat_SeqAIJ*)submat->data;
1670     rmax    = subc->rmax;
1671     smatis1 = subc->submatis1;
1672     nrqs        = smatis1->nrqs;
1673     nrqr        = smatis1->nrqr;
1674     rbuf1       = smatis1->rbuf1;
1675     rbuf2       = smatis1->rbuf2;
1676     rbuf3       = smatis1->rbuf3;
1677     req_source2 = smatis1->req_source2;
1678 
1679     sbuf1     = smatis1->sbuf1;
1680     sbuf2     = smatis1->sbuf2;
1681     ptr       = smatis1->ptr;
1682     tmp       = smatis1->tmp;
1683     ctr       = smatis1->ctr;
1684 
1685     pa         = smatis1->pa;
1686     req_size   = smatis1->req_size;
1687     req_source1 = smatis1->req_source1;
1688 
1689     allcolumns = smatis1->allcolumns;
1690     row2proc   = smatis1->row2proc;
1691     rmap       = smatis1->rmap;
1692     cmap       = smatis1->cmap;
1693 #if defined(PETSC_USE_CTABLE)
1694     rmap_loc   = smatis1->rmap_loc;
1695     cmap_loc   = smatis1->cmap_loc;
1696 #endif
1697   }
1698 
1699   /* Post recv matrix values */
1700   ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1701   ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr);
1702   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1703   for (i=0; i<nrqs; ++i) {
1704     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
1705     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1706   }
1707 
1708   /* Allocate sending buffers for a->a, and send them off */
1709   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
1710   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1711   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
1712   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1713 
1714   for (i=0; i<nrqr; i++) {
1715     rbuf1_i   = rbuf1[i];
1716     sbuf_aa_i = sbuf_aa[i];
1717     ct1       = 2*rbuf1_i[0]+1;
1718     ct2       = 0;
1719     /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1720 
1721     kmax = rbuf1_i[2];
1722     for (k=0; k<kmax; k++,ct1++) {
1723       row = rbuf1_i[ct1] - rstart;
1724       nzA = ai[row+1] - ai[row];
1725       nzB = bi[row+1] - bi[row];
1726       ncols  = nzA + nzB;
1727       cworkB = bj + bi[row];
1728       vworkA = a_a + ai[row];
1729       vworkB = b_a + bi[row];
1730 
1731       /* load the column values for this row into vals*/
1732       vals = sbuf_aa_i + ct2;
1733 
1734       lwrite = 0;
1735       for (l=0; l<nzB; l++) {
1736         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1737       }
1738       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1739       for (l=0; l<nzB; l++) {
1740         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1741       }
1742 
1743       ct2 += ncols;
1744     }
1745     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1746   }
1747 
1748   /* Assemble submat */
1749   /* First assemble the local rows */
1750   for (j=0; j<nrow; j++) {
1751     row  = irow[j];
1752     proc = row2proc[j];
1753     if (proc == rank) {
1754       Crow = row - rstart;  /* local row index of C */
1755 #if defined(PETSC_USE_CTABLE)
1756       row = rmap_loc[Crow]; /* row index of submat */
1757 #else
1758       row = rmap[row];
1759 #endif
1760 
1761       if (allcolumns) {
1762         /* diagonal part A = c->A */
1763         ncols = ai[Crow+1] - ai[Crow];
1764         cols  = aj + ai[Crow];
1765         vals  = a_a + ai[Crow];
1766         i     = 0;
1767         for (k=0; k<ncols; k++) {
1768           subcols[i]   = cols[k] + cstart;
1769           subvals[i++] = vals[k];
1770         }
1771 
1772         /* off-diagonal part B = c->B */
1773         ncols = bi[Crow+1] - bi[Crow];
1774         cols  = bj + bi[Crow];
1775         vals  = b_a + bi[Crow];
1776         for (k=0; k<ncols; k++) {
1777           subcols[i]   = bmap[cols[k]];
1778           subvals[i++] = vals[k];
1779         }
1780 
1781         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1782 
1783       } else { /* !allcolumns */
1784 #if defined(PETSC_USE_CTABLE)
1785         /* diagonal part A = c->A */
1786         ncols = ai[Crow+1] - ai[Crow];
1787         cols  = aj + ai[Crow];
1788         vals  = a_a + ai[Crow];
1789         i     = 0;
1790         for (k=0; k<ncols; k++) {
1791           tcol = cmap_loc[cols[k]];
1792           if (tcol) {
1793             subcols[i]   = --tcol;
1794             subvals[i++] = vals[k];
1795           }
1796         }
1797 
1798         /* off-diagonal part B = c->B */
1799         ncols = bi[Crow+1] - bi[Crow];
1800         cols  = bj + bi[Crow];
1801         vals  = b_a + bi[Crow];
1802         for (k=0; k<ncols; k++) {
1803           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1804           if (tcol) {
1805             subcols[i]   = --tcol;
1806             subvals[i++] = vals[k];
1807           }
1808         }
1809 #else
1810         /* diagonal part A = c->A */
1811         ncols = ai[Crow+1] - ai[Crow];
1812         cols  = aj + ai[Crow];
1813         vals  = a_a + ai[Crow];
1814         i     = 0;
1815         for (k=0; k<ncols; k++) {
1816           tcol = cmap[cols[k]+cstart];
1817           if (tcol) {
1818             subcols[i]   = --tcol;
1819             subvals[i++] = vals[k];
1820           }
1821         }
1822 
1823         /* off-diagonal part B = c->B */
1824         ncols = bi[Crow+1] - bi[Crow];
1825         cols  = bj + bi[Crow];
1826         vals  = b_a + bi[Crow];
1827         for (k=0; k<ncols; k++) {
1828           tcol = cmap[bmap[cols[k]]];
1829           if (tcol) {
1830             subcols[i]   = --tcol;
1831             subvals[i++] = vals[k];
1832           }
1833         }
1834 #endif
1835         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1836       }
1837     }
1838   }
1839 
1840   /* Now assemble the off-proc rows */
1841   for (i=0; i<nrqs; i++) { /* for each requested message */
1842     /* recv values from other processes */
1843     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1844     proc    = pa[idex];
1845     sbuf1_i = sbuf1[proc];
1846     /* jmax    = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1847     ct1     = 2 + 1;
1848     ct2     = 0; /* count of received C->j */
1849     ct3     = 0; /* count of received C->j that will be inserted into submat */
1850     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1851     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1852     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1853 
1854     /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1855     max1 = sbuf1_i[2];             /* num of rows */
1856     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1857       row = sbuf1_i[ct1]; /* row index of submat */
1858       if (!allcolumns) {
1859         idex = 0;
1860         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1861           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1862           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1863 #if defined(PETSC_USE_CTABLE)
1864             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1865               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1866             } else {
1867               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1868             }
1869 #else
1870             tcol = cmap[rbuf3_i[ct2]];
1871 #endif
1872             if (tcol) {
1873               subcols[idex]   = --tcol; /* may not be sorted */
1874               subvals[idex++] = rbuf4_i[ct2];
1875 
1876               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1877                For reuse, we replace received C->j with index that should be inserted to submat */
1878               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1879             }
1880           }
1881           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1882         } else { /* scall == MAT_REUSE_MATRIX */
1883           submat = submats[0];
1884           subc   = (Mat_SeqAIJ*)submat->data;
1885 
1886           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1887           for (l=0; l<nnz; l++) {
1888             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1889             subvals[idex++] = rbuf4_i[ct2];
1890           }
1891 
1892           bj = subc->j + subc->i[row]; /* sorted column indices */
1893           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1894         }
1895       } else { /* allcolumns */
1896         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1897         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1898         ct2 += nnz;
1899       }
1900     }
1901   }
1902 
1903   /* sending a->a are done */
1904   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1905   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1906 
1907   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1908   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1909   submats[0] = submat;
1910 
1911   /* Restore the indices */
1912   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1913   if (!allcolumns) {
1914     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1915   }
1916 
1917   /* Destroy allocated memory */
1918   for (i=0; i<nrqs; ++i) {
1919     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1920   }
1921   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1922   if (sbuf_aa) {
1923     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1924     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1925   }
1926 
1927   if (scall == MAT_INITIAL_MATRIX) {
1928     ierr = PetscFree(lens);CHKERRQ(ierr);
1929     if (sbuf_aj) {
1930       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1931       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1932     }
1933   }
1934   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1935   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1940 {
1941   PetscErrorCode ierr;
1942   PetscInt       ncol;
1943   PetscBool      colflag,allcolumns=PETSC_FALSE;
1944 
1945   PetscFunctionBegin;
1946   /* Allocate memory to hold all the submatrices */
1947   if (scall == MAT_INITIAL_MATRIX) {
1948     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1949   }
1950 
1951   /* Check for special case: each processor gets entire matrix columns */
1952   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1953   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1954   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1955 
1956   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1957   PetscFunctionReturn(0);
1958 }
1959 
/*
   MatCreateSubMatrices_MPIAIJ - top-level driver for extracting a set of
   sequential submatrices from a parallel MPIAIJ matrix C.

   Collective on C.

   Input:
     C     - the parallel matrix
     ismax - number of (isrow, iscol) index-set pairs on this process
     isrow - row index sets (global numbering)
     iscol - column index sets (global numbering)
     scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

   Output:
     submat - array of sequential submatrices (allocated here on the
              initial call; reused on subsequent calls)

   The extraction is done in stages so that scratch memory stays bounded
   (roughly 20 MB of PetscInt per stage); every process must loop through
   the same global number of stages because each stage involves collective
   communication.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = number of submatrices that can be handled per stage, sized so a
     stage uses about 20 MB of PetscInt scratch; at least one per stage */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* A single identity-IS pair covering the whole square matrix means the
         caller wants all of C; a fast path handles that case */
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* allow the user to opt out of the fast path */
        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single reduction that combines both quantities: wantallmatrix is
       negated so that MPI_MAX yields 0 (false) unless EVERY rank wants the
       whole matrix; nstages takes the global maximum so all ranks perform
       the same number of collective stages */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the saved support structure from the first submatrix (or from
       the dummy matrix when this rank requested none) */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  /* Process up to nmax index sets per stage; ranks that have run out of
     local index sets still participate with max_no = 0 and receive a dummy
     matrix so collective calls stay matched */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else                   max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2059 
2060 /* -------------------------------------------------------------------------*/
2061 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2062 {
2063   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2064   Mat            A  = c->A;
2065   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2066   const PetscInt **icol,**irow;
2067   PetscInt       *nrow,*ncol,start;
2068   PetscErrorCode ierr;
2069   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2070   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2071   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2072   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2073   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2074 #if defined(PETSC_USE_CTABLE)
2075   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2076 #else
2077   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2078 #endif
2079   const PetscInt *irow_i;
2080   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2081   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2082   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2083   MPI_Comm       comm;
2084   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2085   PetscMPIInt    *onodes1,*olengths1,end;
2086   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2087   Mat_SubSppt    *smat_i;
2088   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2089   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2090 
2091   PetscFunctionBegin;
2092   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2093   size = c->size;
2094   rank = c->rank;
2095 
2096   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2097   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2098 
2099   for (i=0; i<ismax; i++) {
2100     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2101     if (!issorted[i]) iscsorted = issorted[i];
2102 
2103     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2104 
2105     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2106     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2107 
2108     /* Check for special case: allcolumn */
2109     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2110     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2111     if (colflag && ncol[i] == C->cmap->N) {
2112       allcolumns[i] = PETSC_TRUE;
2113       icol[i] = NULL;
2114     } else {
2115       allcolumns[i] = PETSC_FALSE;
2116       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2117     }
2118   }
2119 
2120   if (scall == MAT_REUSE_MATRIX) {
2121     /* Assumes new rows are same length as the old rows */
2122     for (i=0; i<ismax; i++) {
2123       PetscCheckFalse(!submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
2124       subc = (Mat_SeqAIJ*)submats[i]->data;
2125       PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2126 
2127       /* Initial matrix as if empty */
2128       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2129 
2130       smat_i   = subc->submatis1;
2131 
2132       nrqs        = smat_i->nrqs;
2133       nrqr        = smat_i->nrqr;
2134       rbuf1       = smat_i->rbuf1;
2135       rbuf2       = smat_i->rbuf2;
2136       rbuf3       = smat_i->rbuf3;
2137       req_source2 = smat_i->req_source2;
2138 
2139       sbuf1     = smat_i->sbuf1;
2140       sbuf2     = smat_i->sbuf2;
2141       ptr       = smat_i->ptr;
2142       tmp       = smat_i->tmp;
2143       ctr       = smat_i->ctr;
2144 
2145       pa          = smat_i->pa;
2146       req_size    = smat_i->req_size;
2147       req_source1 = smat_i->req_source1;
2148 
2149       allcolumns[i] = smat_i->allcolumns;
2150       row2proc[i]   = smat_i->row2proc;
2151       rmap[i]       = smat_i->rmap;
2152       cmap[i]       = smat_i->cmap;
2153     }
2154 
2155     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2156       PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2157       smat_i = (Mat_SubSppt*)submats[0]->data;
2158 
2159       nrqs        = smat_i->nrqs;
2160       nrqr        = smat_i->nrqr;
2161       rbuf1       = smat_i->rbuf1;
2162       rbuf2       = smat_i->rbuf2;
2163       rbuf3       = smat_i->rbuf3;
2164       req_source2 = smat_i->req_source2;
2165 
2166       sbuf1       = smat_i->sbuf1;
2167       sbuf2       = smat_i->sbuf2;
2168       ptr         = smat_i->ptr;
2169       tmp         = smat_i->tmp;
2170       ctr         = smat_i->ctr;
2171 
2172       pa          = smat_i->pa;
2173       req_size    = smat_i->req_size;
2174       req_source1 = smat_i->req_source1;
2175 
2176       allcolumns[0] = PETSC_FALSE;
2177     }
2178   } else { /* scall == MAT_INITIAL_MATRIX */
2179     /* Get some new tags to keep the communication clean */
2180     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2181     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2182 
2183     /* evaluate communication - mesg to who, length of mesg, and buffer space
2184      required. Based on this, buffers are allocated, and data copied into them*/
2185     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2186 
2187     for (i=0; i<ismax; i++) {
2188       jmax   = nrow[i];
2189       irow_i = irow[i];
2190 
2191       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2192       row2proc[i] = row2proc_i;
2193 
2194       if (issorted[i]) proc = 0;
2195       for (j=0; j<jmax; j++) {
2196         if (!issorted[i]) proc = 0;
2197         row = irow_i[j];
2198         while (row >= C->rmap->range[proc+1]) proc++;
2199         w4[proc]++;
2200         row2proc_i[j] = proc; /* map row index to proc */
2201       }
2202       for (j=0; j<size; j++) {
2203         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2204       }
2205     }
2206 
2207     nrqs     = 0;              /* no of outgoing messages */
2208     msz      = 0;              /* total mesg length (for all procs) */
2209     w1[rank] = 0;              /* no mesg sent to self */
2210     w3[rank] = 0;
2211     for (i=0; i<size; i++) {
2212       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2213     }
2214     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
2215     for (i=0,j=0; i<size; i++) {
2216       if (w1[i]) { pa[j] = i; j++; }
2217     }
2218 
2219     /* Each message would have a header = 1 + 2*(no of IS) + data */
2220     for (i=0; i<nrqs; i++) {
2221       j      = pa[i];
2222       w1[j] += w2[j] + 2* w3[j];
2223       msz   += w1[j];
2224     }
2225     ierr = PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);CHKERRQ(ierr);
2226 
2227     /* Determine the number of messages to expect, their lengths, from from-ids */
2228     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2229     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2230 
2231     /* Now post the Irecvs corresponding to these messages */
2232     ierr = PetscObjectGetNewTag((PetscObject)C,&tag0);CHKERRQ(ierr);
2233     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2234 
2235     /* Allocate Memory for outgoing messages */
2236     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2237     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2238     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2239 
2240     {
2241       PetscInt *iptr = tmp;
2242       k    = 0;
2243       for (i=0; i<nrqs; i++) {
2244         j        = pa[i];
2245         iptr    += k;
2246         sbuf1[j] = iptr;
2247         k        = w1[j];
2248       }
2249     }
2250 
2251     /* Form the outgoing messages. Initialize the header space */
2252     for (i=0; i<nrqs; i++) {
2253       j           = pa[i];
2254       sbuf1[j][0] = 0;
2255       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2256       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2257     }
2258 
2259     /* Parse the isrow and copy data into outbuf */
2260     for (i=0; i<ismax; i++) {
2261       row2proc_i = row2proc[i];
2262       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2263       irow_i = irow[i];
2264       jmax   = nrow[i];
2265       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2266         proc = row2proc_i[j];
2267         if (proc != rank) { /* copy to the outgoing buf*/
2268           ctr[proc]++;
2269           *ptr[proc] = irow_i[j];
2270           ptr[proc]++;
2271         }
2272       }
2273       /* Update the headers for the current IS */
2274       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2275         if ((ctr_j = ctr[j])) {
2276           sbuf1_j        = sbuf1[j];
2277           k              = ++sbuf1_j[0];
2278           sbuf1_j[2*k]   = ctr_j;
2279           sbuf1_j[2*k-1] = i;
2280         }
2281       }
2282     }
2283 
2284     /*  Now  post the sends */
2285     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
2286     for (i=0; i<nrqs; ++i) {
2287       j    = pa[i];
2288       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2289     }
2290 
2291     /* Post Receives to capture the buffer size */
2292     ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr);
2293     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
2294     if (nrqs) rbuf2[0] = tmp + msz;
2295     for (i=1; i<nrqs; ++i) {
2296       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2297     }
2298     for (i=0; i<nrqs; ++i) {
2299       j    = pa[i];
2300       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2301     }
2302 
2303     /* Send to other procs the buf size they should allocate */
2304     /* Receive messages*/
2305     ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
2306     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2307     {
2308       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2309       PetscInt   *sbuf2_i;
2310 
2311       ierr = MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2312       for (i=0; i<nrqr; ++i) {
2313         req_size[i] = 0;
2314         rbuf1_i        = rbuf1[i];
2315         start          = 2*rbuf1_i[0] + 1;
2316         end            = olengths1[i];
2317         ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
2318         sbuf2_i        = sbuf2[i];
2319         for (j=start; j<end; j++) {
2320           id              = rbuf1_i[j] - rstart;
2321           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2322           sbuf2_i[j]      = ncols;
2323           req_size[i] += ncols;
2324         }
2325         req_source1[i] = onodes1[i];
2326         /* form the header */
2327         sbuf2_i[0] = req_size[i];
2328         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2329 
2330         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2331       }
2332     }
2333 
2334     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2335     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2336     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2337     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2338 
2339     /* Receive messages*/
2340     ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr);
2341     ierr = MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2342     for (i=0; i<nrqs; ++i) {
2343       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
2344       req_source2[i] = pa[i];
2345       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2346     }
2347     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2348 
2349     /* Wait on sends1 and sends2 */
2350     ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2351     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2352     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2353     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2354 
2355     /* Now allocate sending buffers for a->j, and send them off */
2356     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
2357     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2358     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
2359     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2360 
2361     ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr);
2362     {
2363       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2364       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2365       PetscInt cend = C->cmap->rend;
2366       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2367 
2368       for (i=0; i<nrqr; i++) {
2369         rbuf1_i   = rbuf1[i];
2370         sbuf_aj_i = sbuf_aj[i];
2371         ct1       = 2*rbuf1_i[0] + 1;
2372         ct2       = 0;
2373         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2374           kmax = rbuf1[i][2*j];
2375           for (k=0; k<kmax; k++,ct1++) {
2376             row    = rbuf1_i[ct1] - rstart;
2377             nzA    = a_i[row+1] - a_i[row];
2378             nzB    = b_i[row+1] - b_i[row];
2379             ncols  = nzA + nzB;
2380             cworkA = a_j + a_i[row];
2381             cworkB = b_j + b_i[row];
2382 
2383             /* load the column indices for this row into cols */
2384             cols = sbuf_aj_i + ct2;
2385 
2386             lwrite = 0;
2387             for (l=0; l<nzB; l++) {
2388               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2389             }
2390             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2391             for (l=0; l<nzB; l++) {
2392               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2393             }
2394 
2395             ct2 += ncols;
2396           }
2397         }
2398         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2399       }
2400     }
2401 
2402     /* create col map: global col of C -> local col of submatrices */
2403     {
2404       const PetscInt *icol_i;
2405 #if defined(PETSC_USE_CTABLE)
2406       for (i=0; i<ismax; i++) {
2407         if (!allcolumns[i]) {
2408           ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2409 
2410           jmax   = ncol[i];
2411           icol_i = icol[i];
2412           cmap_i = cmap[i];
2413           for (j=0; j<jmax; j++) {
2414             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2415           }
2416         } else cmap[i] = NULL;
2417       }
2418 #else
2419       for (i=0; i<ismax; i++) {
2420         if (!allcolumns[i]) {
2421           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2422           jmax   = ncol[i];
2423           icol_i = icol[i];
2424           cmap_i = cmap[i];
2425           for (j=0; j<jmax; j++) {
2426             cmap_i[icol_i[j]] = j+1;
2427           }
2428         } else cmap[i] = NULL;
2429       }
2430 #endif
2431     }
2432 
2433     /* Create lens which is required for MatCreate... */
2434     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2435     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2436 
2437     if (ismax) {
2438       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2439     }
2440     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2441 
2442     /* Update lens from local data */
2443     for (i=0; i<ismax; i++) {
2444       row2proc_i = row2proc[i];
2445       jmax = nrow[i];
2446       if (!allcolumns[i]) cmap_i = cmap[i];
2447       irow_i = irow[i];
2448       lens_i = lens[i];
2449       for (j=0; j<jmax; j++) {
2450         row = irow_i[j];
2451         proc = row2proc_i[j];
2452         if (proc == rank) {
2453           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2454           if (!allcolumns[i]) {
2455             for (k=0; k<ncols; k++) {
2456 #if defined(PETSC_USE_CTABLE)
2457               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2458 #else
2459               tcol = cmap_i[cols[k]];
2460 #endif
2461               if (tcol) lens_i[j]++;
2462             }
2463           } else { /* allcolumns */
2464             lens_i[j] = ncols;
2465           }
2466           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2467         }
2468       }
2469     }
2470 
2471     /* Create row map: global row of C -> local row of submatrices */
2472 #if defined(PETSC_USE_CTABLE)
2473     for (i=0; i<ismax; i++) {
2474       ierr   = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2475       irow_i = irow[i];
2476       jmax   = nrow[i];
2477       for (j=0; j<jmax; j++) {
2478       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2479       }
2480     }
2481 #else
2482     for (i=0; i<ismax; i++) {
2483       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2484       rmap_i = rmap[i];
2485       irow_i = irow[i];
2486       jmax   = nrow[i];
2487       for (j=0; j<jmax; j++) {
2488         rmap_i[irow_i[j]] = j;
2489       }
2490     }
2491 #endif
2492 
2493     /* Update lens from offproc data */
2494     {
2495       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2496 
2497       ierr    = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2498       for (tmp2=0; tmp2<nrqs; tmp2++) {
2499         sbuf1_i = sbuf1[pa[tmp2]];
2500         jmax    = sbuf1_i[0];
2501         ct1     = 2*jmax+1;
2502         ct2     = 0;
2503         rbuf2_i = rbuf2[tmp2];
2504         rbuf3_i = rbuf3[tmp2];
2505         for (j=1; j<=jmax; j++) {
2506           is_no  = sbuf1_i[2*j-1];
2507           max1   = sbuf1_i[2*j];
2508           lens_i = lens[is_no];
2509           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2510           rmap_i = rmap[is_no];
2511           for (k=0; k<max1; k++,ct1++) {
2512 #if defined(PETSC_USE_CTABLE)
2513             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2514             row--;
2515             PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2516 #else
2517             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2518 #endif
2519             max2 = rbuf2_i[ct1];
2520             for (l=0; l<max2; l++,ct2++) {
2521               if (!allcolumns[is_no]) {
2522 #if defined(PETSC_USE_CTABLE)
2523                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2524 #else
2525                 tcol = cmap_i[rbuf3_i[ct2]];
2526 #endif
2527                 if (tcol) lens_i[row]++;
2528               } else { /* allcolumns */
2529                 lens_i[row]++; /* lens_i[row] += max2 ? */
2530               }
2531             }
2532           }
2533         }
2534       }
2535     }
2536     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2537     ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2538     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2539 
2540     /* Create the submatrices */
2541     for (i=0; i<ismax; i++) {
2542       PetscInt    rbs,cbs;
2543 
2544       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2545       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2546 
2547       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2548       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2549 
2550       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2551       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2552       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2553 
2554       /* create struct Mat_SubSppt and attached it to submat */
2555       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2556       subc = (Mat_SeqAIJ*)submats[i]->data;
2557       subc->submatis1 = smat_i;
2558 
2559       smat_i->destroy          = submats[i]->ops->destroy;
2560       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2561       submats[i]->factortype   = C->factortype;
2562 
2563       smat_i->id          = i;
2564       smat_i->nrqs        = nrqs;
2565       smat_i->nrqr        = nrqr;
2566       smat_i->rbuf1       = rbuf1;
2567       smat_i->rbuf2       = rbuf2;
2568       smat_i->rbuf3       = rbuf3;
2569       smat_i->sbuf2       = sbuf2;
2570       smat_i->req_source2 = req_source2;
2571 
2572       smat_i->sbuf1       = sbuf1;
2573       smat_i->ptr         = ptr;
2574       smat_i->tmp         = tmp;
2575       smat_i->ctr         = ctr;
2576 
2577       smat_i->pa           = pa;
2578       smat_i->req_size     = req_size;
2579       smat_i->req_source1  = req_source1;
2580 
2581       smat_i->allcolumns  = allcolumns[i];
2582       smat_i->singleis    = PETSC_FALSE;
2583       smat_i->row2proc    = row2proc[i];
2584       smat_i->rmap        = rmap[i];
2585       smat_i->cmap        = cmap[i];
2586     }
2587 
2588     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2589       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2590       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2591       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2592 
2593       /* create struct Mat_SubSppt and attached it to submat */
2594       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2595       submats[0]->data = (void*)smat_i;
2596 
2597       smat_i->destroy          = submats[0]->ops->destroy;
2598       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2599       submats[0]->factortype   = C->factortype;
2600 
2601       smat_i->id          = 0;
2602       smat_i->nrqs        = nrqs;
2603       smat_i->nrqr        = nrqr;
2604       smat_i->rbuf1       = rbuf1;
2605       smat_i->rbuf2       = rbuf2;
2606       smat_i->rbuf3       = rbuf3;
2607       smat_i->sbuf2       = sbuf2;
2608       smat_i->req_source2 = req_source2;
2609 
2610       smat_i->sbuf1       = sbuf1;
2611       smat_i->ptr         = ptr;
2612       smat_i->tmp         = tmp;
2613       smat_i->ctr         = ctr;
2614 
2615       smat_i->pa           = pa;
2616       smat_i->req_size     = req_size;
2617       smat_i->req_source1  = req_source1;
2618 
2619       smat_i->allcolumns  = PETSC_FALSE;
2620       smat_i->singleis    = PETSC_FALSE;
2621       smat_i->row2proc    = NULL;
2622       smat_i->rmap        = NULL;
2623       smat_i->cmap        = NULL;
2624     }
2625 
2626     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2627     ierr = PetscFree(lens);CHKERRQ(ierr);
2628     if (sbuf_aj) {
2629       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2630       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2631     }
2632 
2633   } /* endof scall == MAT_INITIAL_MATRIX */
2634 
2635   /* Post recv matrix values */
2636   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2637   ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr);
2638   ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr);
2639   for (i=0; i<nrqs; ++i) {
2640     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
2641     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2642   }
2643 
2644   /* Allocate sending buffers for a->a, and send them off */
2645   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
2646   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2647   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
2648   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2649 
2650   ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr);
2651   {
2652     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2653     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2654     PetscInt    cend   = C->cmap->rend;
2655     PetscInt    *b_j   = b->j;
2656     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2657 
2658     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2659     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2660     for (i=0; i<nrqr; i++) {
2661       rbuf1_i   = rbuf1[i];
2662       sbuf_aa_i = sbuf_aa[i];
2663       ct1       = 2*rbuf1_i[0]+1;
2664       ct2       = 0;
2665       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2666         kmax = rbuf1_i[2*j];
2667         for (k=0; k<kmax; k++,ct1++) {
2668           row    = rbuf1_i[ct1] - rstart;
2669           nzA    = a_i[row+1] - a_i[row];
2670           nzB    = b_i[row+1] - b_i[row];
2671           ncols  = nzA + nzB;
2672           cworkB = b_j + b_i[row];
2673           vworkA = a_a + a_i[row];
2674           vworkB = b_a + b_i[row];
2675 
2676           /* load the column values for this row into vals*/
2677           vals = sbuf_aa_i+ct2;
2678 
2679           lwrite = 0;
2680           for (l=0; l<nzB; l++) {
2681             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2682           }
2683           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2684           for (l=0; l<nzB; l++) {
2685             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2686           }
2687 
2688           ct2 += ncols;
2689         }
2690       }
2691       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2692     }
2693     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2694     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2695   }
2696 
2697   /* Assemble the matrices */
2698   /* First assemble the local rows */
2699   for (i=0; i<ismax; i++) {
2700     row2proc_i = row2proc[i];
2701     subc      = (Mat_SeqAIJ*)submats[i]->data;
2702     imat_ilen = subc->ilen;
2703     imat_j    = subc->j;
2704     imat_i    = subc->i;
2705     imat_a    = subc->a;
2706 
2707     if (!allcolumns[i]) cmap_i = cmap[i];
2708     rmap_i = rmap[i];
2709     irow_i = irow[i];
2710     jmax   = nrow[i];
2711     for (j=0; j<jmax; j++) {
2712       row  = irow_i[j];
2713       proc = row2proc_i[j];
2714       if (proc == rank) {
2715         old_row = row;
2716 #if defined(PETSC_USE_CTABLE)
2717         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2718         row--;
2719 #else
2720         row = rmap_i[row];
2721 #endif
2722         ilen_row = imat_ilen[row];
2723         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2724         mat_i    = imat_i[row];
2725         mat_a    = imat_a + mat_i;
2726         mat_j    = imat_j + mat_i;
2727         if (!allcolumns[i]) {
2728           for (k=0; k<ncols; k++) {
2729 #if defined(PETSC_USE_CTABLE)
2730             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2731 #else
2732             tcol = cmap_i[cols[k]];
2733 #endif
2734             if (tcol) {
2735               *mat_j++ = tcol - 1;
2736               *mat_a++ = vals[k];
2737               ilen_row++;
2738             }
2739           }
2740         } else { /* allcolumns */
2741           for (k=0; k<ncols; k++) {
2742             *mat_j++ = cols[k];  /* global col index! */
2743             *mat_a++ = vals[k];
2744             ilen_row++;
2745           }
2746         }
2747         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2748 
2749         imat_ilen[row] = ilen_row;
2750       }
2751     }
2752   }
2753 
2754   /* Now assemble the off proc rows */
2755   ierr    = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2756   for (tmp2=0; tmp2<nrqs; tmp2++) {
2757     sbuf1_i = sbuf1[pa[tmp2]];
2758     jmax    = sbuf1_i[0];
2759     ct1     = 2*jmax + 1;
2760     ct2     = 0;
2761     rbuf2_i = rbuf2[tmp2];
2762     rbuf3_i = rbuf3[tmp2];
2763     rbuf4_i = rbuf4[tmp2];
2764     for (j=1; j<=jmax; j++) {
2765       is_no     = sbuf1_i[2*j-1];
2766       rmap_i    = rmap[is_no];
2767       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2768       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2769       imat_ilen = subc->ilen;
2770       imat_j    = subc->j;
2771       imat_i    = subc->i;
2772       imat_a    = subc->a;
2773       max1      = sbuf1_i[2*j];
2774       for (k=0; k<max1; k++,ct1++) {
2775         row = sbuf1_i[ct1];
2776 #if defined(PETSC_USE_CTABLE)
2777         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2778         row--;
2779 #else
2780         row = rmap_i[row];
2781 #endif
2782         ilen  = imat_ilen[row];
2783         mat_i = imat_i[row];
2784         mat_a = imat_a + mat_i;
2785         mat_j = imat_j + mat_i;
2786         max2  = rbuf2_i[ct1];
2787         if (!allcolumns[is_no]) {
2788           for (l=0; l<max2; l++,ct2++) {
2789 #if defined(PETSC_USE_CTABLE)
2790             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2791 #else
2792             tcol = cmap_i[rbuf3_i[ct2]];
2793 #endif
2794             if (tcol) {
2795               *mat_j++ = tcol - 1;
2796               *mat_a++ = rbuf4_i[ct2];
2797               ilen++;
2798             }
2799           }
2800         } else { /* allcolumns */
2801           for (l=0; l<max2; l++,ct2++) {
2802             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2803             *mat_a++ = rbuf4_i[ct2];
2804             ilen++;
2805           }
2806         }
2807         imat_ilen[row] = ilen;
2808       }
2809     }
2810   }
2811 
2812   if (!iscsorted) { /* sort column indices of the rows */
2813     for (i=0; i<ismax; i++) {
2814       subc      = (Mat_SeqAIJ*)submats[i]->data;
2815       imat_j    = subc->j;
2816       imat_i    = subc->i;
2817       imat_a    = subc->a;
2818       imat_ilen = subc->ilen;
2819 
2820       if (allcolumns[i]) continue;
2821       jmax = nrow[i];
2822       for (j=0; j<jmax; j++) {
2823         mat_i = imat_i[j];
2824         mat_a = imat_a + mat_i;
2825         mat_j = imat_j + mat_i;
2826         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2827       }
2828     }
2829   }
2830 
2831   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2832   ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2833   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2834 
2835   /* Restore the indices */
2836   for (i=0; i<ismax; i++) {
2837     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2838     if (!allcolumns[i]) {
2839       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2840     }
2841   }
2842 
2843   for (i=0; i<ismax; i++) {
2844     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2845     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2846   }
2847 
2848   /* Destroy allocated memory */
2849   if (sbuf_aa) {
2850     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2851     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2852   }
2853   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2854 
2855   for (i=0; i<nrqs; ++i) {
2856     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2857   }
2858   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2859 
2860   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2861   PetscFunctionReturn(0);
2862 }
2863 
2864 /*
2865  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2866  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2867  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2868  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2869  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2870  state, and needs to be "assembled" later by compressing B's column space.
2871 
2872  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2873  Following this call, C->A & C->B have been created, even if empty.
2874  */
2875 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2876 {
2877   /* If making this function public, change the error returned in this function away from _PLIB. */
2878   PetscErrorCode ierr;
2879   Mat_MPIAIJ     *aij;
2880   Mat_SeqAIJ     *Baij;
2881   PetscBool      seqaij,Bdisassembled;
2882   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2883   PetscScalar    v;
2884   const PetscInt *rowindices,*colindices;
2885 
2886   PetscFunctionBegin;
2887   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2888   if (A) {
2889     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2890     PetscCheckFalse(!seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2891     if (rowemb) {
2892       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2893       PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2894     } else {
2895       if (C->rmap->n != A->rmap->n) {
2896         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2897       }
2898     }
2899     if (dcolemb) {
2900       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2901       PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n);
2902     } else {
2903       PetscCheckFalse(C->cmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2904     }
2905   }
2906   if (B) {
2907     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2908     PetscCheckFalse(!seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2909     if (rowemb) {
2910       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2911       PetscCheckFalse(m != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2912     } else {
2913       if (C->rmap->n != B->rmap->n) {
2914         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2915       }
2916     }
2917     if (ocolemb) {
2918       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2919       PetscCheckFalse(n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n);
2920     } else {
2921       PetscCheckFalse(C->cmap->N - C->cmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2922     }
2923   }
2924 
2925   aij = (Mat_MPIAIJ*)C->data;
2926   if (!aij->A) {
2927     /* Mimic parts of MatMPIAIJSetPreallocation() */
2928     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2929     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2930     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2931     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2932     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2933   }
2934   if (A) {
2935     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2936   } else {
2937     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2938   }
2939   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2940     /*
2941       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2942       need to "disassemble" B -- convert it to using C's global indices.
2943       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2944 
2945       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2946       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2947 
2948       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2949       At least avoid calling MatSetValues() and the implied searches?
2950     */
2951 
2952     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2953 #if defined(PETSC_USE_CTABLE)
2954       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2955 #else
2956       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2957       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2958       if (aij->B) {
2959         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2960       }
2961 #endif
2962       ngcol = 0;
2963       if (aij->lvec) {
2964         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2965       }
2966       if (aij->garray) {
2967         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2968         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2969       }
2970       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2971       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2972     }
2973     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2974       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2975     }
2976     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2977       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2978     }
2979   }
2980   Bdisassembled = PETSC_FALSE;
2981   if (!aij->B) {
2982     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2983     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2984     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2985     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2986     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
2987     Bdisassembled = PETSC_TRUE;
2988   }
2989   if (B) {
2990     Baij = (Mat_SeqAIJ*)B->data;
2991     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2992       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
2993       for (i=0; i<B->rmap->n; i++) {
2994         nz[i] = Baij->i[i+1] - Baij->i[i];
2995       }
2996       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
2997       ierr = PetscFree(nz);CHKERRQ(ierr);
2998     }
2999 
3000     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3001     shift = rend-rstart;
3002     count = 0;
3003     rowindices = NULL;
3004     colindices = NULL;
3005     if (rowemb) {
3006       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3007     }
3008     if (ocolemb) {
3009       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3010     }
3011     for (i=0; i<B->rmap->n; i++) {
3012       PetscInt row;
3013       row = i;
3014       if (rowindices) row = rowindices[i];
3015       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3016         col  = Baij->j[count];
3017         if (colindices) col = colindices[col];
3018         if (Bdisassembled && col>=rstart) col += shift;
3019         v    = Baij->a[count];
3020         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3021         ++count;
3022       }
3023     }
3024     /* No assembly for aij->B is necessary. */
3025     /* FIXME: set aij->B's nonzerostate correctly. */
3026   } else {
3027     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3028   }
3029   C->preallocated  = PETSC_TRUE;
3030   C->was_assembled = PETSC_FALSE;
3031   C->assembled     = PETSC_FALSE;
3032    /*
3033       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3034       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3035    */
3036   PetscFunctionReturn(0);
3037 }
3038 
3039 /*
3040   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3041  */
3042 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3043 {
3044   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3045 
3046   PetscFunctionBegin;
3047   PetscValidPointer(A,2);
3048   PetscValidPointer(B,3);
3049   /* FIXME: make sure C is assembled */
3050   *A = aij->A;
3051   *B = aij->B;
3052   /* Note that we don't incref *A and *B, so be careful! */
3053   PetscFunctionReturn(0);
3054 }
3055 
3056 /*
3057   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3058   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3059 */
/*
  Generic driver that extracts the parallel submatrices C[isrow[i],iscol[i]], where the
  index-set pairs may live on subcommunicators of C's communicator.  The format-specific
  work is delegated to the callbacks:
    getsubmats_seq - extracts sequential submatrices (e.g., MatCreateSubMatrices_MPIAIJ);
    getlocalmats   - retrieves the local blocks of an MPI matrix (not called in this
                     routine; part of the callback interface);
    setseqmat      - embeds a sequential matrix, with row/col permutations applied, into
                     a sequential submatrix;
    setseqmats     - embeds the diag (A) and off-diag (B) sequential pieces into a
                     parallel submatrix.
  With scall == MAT_REUSE_MATRIX, *submat must hold the matrices produced by an earlier
  MAT_INITIAL_MATRIX call with the same index sets.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    size,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* cismax counts this rank's parallel (comm size > 1) index sets; each isrow[i]/iscol[i]
     pair must share one communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
    PetscCheckFalse(flag != MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
     */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      PetscCheckFalse(indices[j] == indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      PetscCheckFalse(indices[j-1] == indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction, per the comment above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
        - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* Drop our reference to the diag piece here; the off-diag pieces held in B are
           released collectively by MatDestroySubMatrices() at the end of this routine. */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3255 
/*
  MPIAIJ instantiation of the generic driver MatCreateSubMatricesMPI_MPIXAIJ(): supplies the
  AIJ-specific callbacks -- MatCreateSubMatrices_MPIAIJ for sequential extraction,
  MatGetSeqMats_MPIAIJ for retrieving local blocks, and MatSetSeqMat_SeqAIJ /
  MatSetSeqMats_MPIAIJ for embedding the sequential pieces into the result.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3264