xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2fa40bb9206b96114faa7cb222621ec184d31cd2)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
33 
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48 {
49   PetscErrorCode ierr;
50   MPI_Comm       comm;
51   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54   const PetscInt *indices_i,**indices;
55   PetscLayout    rmap;
56   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
57   PetscSF        sf;
58   PetscSFNode    *remote;
59 
60   PetscFunctionBegin;
61   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
62   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
63   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
64   /* get row map to determine where rows should be going */
65   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
66   /* retrieve IS data and put all together so that we
67    * can optimize communication
68    *  */
69   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
70   for (i=0,tlength=0; i<nidx; i++) {
71     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
72     tlength += length[i];
73     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
74   }
75   /* find these rows on remote processors */
76   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
77   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
78   nrrows = 0;
79   for (i=0; i<nidx; i++) {
80     length_i  = length[i];
81     indices_i = indices[i];
82     for (j=0; j<length_i; j++) {
83       owner = -1;
84       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
85       /* remote processors */
86       if (owner != rank) {
87         tosizes_temp[owner]++; /* number of rows to owner */
88         rrow_ranks[nrrows]  = owner; /* processor */
89         rrow_isids[nrrows]   = i; /* is id */
90         remoterows[nrrows++] = indices_i[j]; /* row */
91       }
92     }
93     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
94   }
95   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
96   /* test if we need to exchange messages
97    * generally speaking, we do not need to exchange
98    * data when overlap is 1
99    * */
100   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
101   /* we do not have any messages
102    * It usually corresponds to overlap 1
103    * */
104   if (!reducednrrows) {
105     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
106     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
107     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
108     PetscFunctionReturn(0);
109   }
110   nto = 0;
111   /* send sizes and ranks for building a two-sided communcation */
112   for (i=0; i<size; i++) {
113     if (tosizes_temp[i]) {
114       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115       tosizes_temp[i] = nto; /* a map from processor to index */
116       toranks[nto++]  = i; /* processor */
117     }
118   }
119   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
120   toffsets[0] = 0;
121   for (i=0; i<nto; i++) {
122     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124   }
125   /* send information to other processors */
126   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
127   nrecvrows = 0;
128   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
130   nrecvrows = 0;
131   for (i=0; i<nfrom; i++) {
132     for (j=0; j<fromsizes[2*i]; j++) {
133       remote[nrecvrows].rank    = fromranks[i];
134       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135     }
136   }
137   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
138   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
139   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
140   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
141   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
142   /* message pair <no of is, row>  */
143   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
144   for (i=0; i<nrrows; i++) {
145     owner = rrow_ranks[i]; /* processor */
146     j     = tosizes_temp[owner]; /* index */
147     todata[toffsets[j]++] = rrow_isids[i];
148     todata[toffsets[j]++] = remoterows[i];
149   }
150   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
151   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
152   ierr = PetscFree(toffsets);CHKERRQ(ierr);
153   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
154   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
155   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
156   /* send rows belonging to the remote so that then we could get the overlapping data back */
157   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
158   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
159   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
160   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
161   ierr = PetscFree(fromranks);CHKERRQ(ierr);
162   nrecvrows = 0;
163   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
165   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
166   nrecvrows = 0;
167   for (i=0; i<nto; i++) {
168     for (j=0; j<tosizes[2*i]; j++) {
169       remote[nrecvrows].rank    = toranks[i];
170       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171     }
172   }
173   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
174   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
175   /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
176   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
177   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
178   /* overlap communication and computation */
179   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
180   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
181   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
183   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
184   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
185   ierr = PetscFree(toranks);CHKERRQ(ierr);
186   ierr = PetscFree(tosizes);CHKERRQ(ierr);
187   ierr = PetscFree(todata);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
190 
191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192 {
193   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195   const PetscInt   *indices_i_temp;
196   MPI_Comm         *iscomms;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++) {
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++) {
209     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
210     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
211     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
212     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
214   }
215   /* retrieve information to get row id and its overlap */
216   for (i=0; i<nrecvs;) {
217     is_id     = recvdata[i++];
218     data_size = recvdata[i++];
219     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220     isz_i     = isz[is_id];
221     for (j=0; j< data_size; j++) {
222       col = recvdata[i++];
223       indices_i[isz_i++] = col;
224     }
225     isz[is_id] = isz_i;
226   }
227   /* remove duplicate entities */
228   for (i=0; i<nidx; i++) {
229     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230     isz_i     = isz[i];
231     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
232     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
233     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
234   }
235   ierr = PetscFree(isz);CHKERRQ(ierr);
236   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
237   PetscFunctionReturn(0);
238 }
239 
240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241 {
242   PetscLayout       rmap,cmap;
243   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247   Mat               amat,bmat;
248   PetscMPIInt       rank;
249   PetscBool         done;
250   MPI_Comm          comm;
251   PetscErrorCode    ierr;
252 
253   PetscFunctionBegin;
254   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
255   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
256   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
257   /* Even if the mat is symmetric, we still assume it is not symmetric */
258   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
260   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
262   /* total number of nonzero values is used to estimate the memory usage in the next step */
263   tnz  = ai[an]+bi[bn];
264   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
265   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
267   /* to find the longest message */
268   max_fszs = 0;
269   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270   /* better way to estimate number of nonzero in the mat??? */
271   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
272   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273   rows_pos  = 0;
274   totalrows = 0;
275   for (i=0; i<nfrom; i++) {
276     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
277     /* group data together */
278     for (j=0; j<fromsizes[2*i]; j+=2) {
279       is_id                       = fromrows[rows_pos++];/* no of is */
280       rows_i                      = rows_data[is_id];
281       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282     }
283     /* estimate a space to avoid multiple allocations  */
284     for (j=0; j<nidx; j++) {
285       indvc_ij = 0;
286       rows_i   = rows_data[j];
287       for (l=0; l<rows_pos_i[j]; l++) {
288         row    = rows_i[l]-rstart;
289         start  = ai[row];
290         end    = ai[row+1];
291         for (k=start; k<end; k++) { /* Amat */
292           col = aj[k] + cstart;
293           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294         }
295         start = bi[row];
296         end   = bi[row+1];
297         for (k=start; k<end; k++) { /* Bmat */
298           col = gcols[bj[k]];
299           indices_tmp[indvc_ij++] = col;
300         }
301       }
302       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
303       indv_counts[i*nidx+j] = indvc_ij;
304       totalrows            += indvc_ij;
305     }
306   }
307   /* message triple <no of is, number of rows, rows> */
308   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
309   totalrows = 0;
310   rows_pos  = 0;
311   /* use this code again */
312   for (i=0;i<nfrom;i++) {
313     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
314     for (j=0; j<fromsizes[2*i]; j+=2) {
315       is_id                       = fromrows[rows_pos++];
316       rows_i                      = rows_data[is_id];
317       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318     }
319     /* add data  */
320     for (j=0; j<nidx; j++) {
321       if (!indv_counts[i*nidx+j]) continue;
322       indvc_ij = 0;
323       sbdata[totalrows++] = j;
324       sbdata[totalrows++] = indv_counts[i*nidx+j];
325       sbsizes[2*i]       += 2;
326       rows_i              = rows_data[j];
327       for (l=0; l<rows_pos_i[j]; l++) {
328         row   = rows_i[l]-rstart;
329         start = ai[row];
330         end   = ai[row+1];
331         for (k=start; k<end; k++) { /* Amat */
332           col = aj[k] + cstart;
333           indices_tmp[indvc_ij++] = col;
334         }
335         start = bi[row];
336         end   = bi[row+1];
337         for (k=start; k<end; k++) { /* Bmat */
338           col = gcols[bj[k]];
339           indices_tmp[indvc_ij++] = col;
340         }
341       }
342       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
343       sbsizes[2*i]  += indvc_ij;
344       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
345       totalrows += indvc_ij;
346     }
347   }
348   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
349   offsets[0] = 0;
350   for (i=0; i<nfrom; i++) {
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp;
368   PetscMPIInt       rank,owner;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
383   /* is it a safe way to compute number of nonzero values ? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* it is a better way to estimate memory than the old implementation
389    * where global size of matrix is used
390    * */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 /*
432   Sample message format:
433   If a processor A wants processor B to process some elements corresponding
434   to index sets is[1],is[5]
435   mesg [0] = 2   (no of index sets in the mesg)
436   -----------
437   mesg [1] = 1 => is[1]
438   mesg [2] = sizeof(is[1]);
439   -----------
440   mesg [3] = 5  => is[5]
441   mesg [4] = sizeof(is[5]);
442   -----------
443   mesg [5]
444   mesg [n]  datas[1]
445   -----------
446   mesg[n+1]
447   mesg[m]  data(is[5])
448   -----------
449 
450   Notes:
451   nrqs - no of requests sent (or to be sent out)
452   nrqr - no of requests received (which have to be or which have been processed)
453 */
454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455 {
456   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458   const PetscInt **idx,*idx_i;
459   PetscInt       *n,**data,len;
460 #if defined(PETSC_USE_CTABLE)
461   PetscTable     *table_data,table_data_i;
462   PetscInt       *tdata,tcount,tcount_max;
463 #else
464   PetscInt       *data_i,*d_p;
465 #endif
466   PetscErrorCode ierr;
467   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470   PetscBT        *table;
471   MPI_Comm       comm;
472   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473   MPI_Status     *recv_status;
474   MPI_Comm       *iscomms;
475   char           *t_p;
476 
477   PetscFunctionBegin;
478   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
479   size = c->size;
480   rank = c->rank;
481   M    = C->rmap->N;
482 
483   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
485 
486   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
487 
488   for (i=0; i<imax; i++) {
489     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
490     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
491   }
492 
493   /* evaluate communication - mesg to who,length of mesg, and buffer space
494      required. Based on this, buffers are allocated, and data copied into them  */
495   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
496   for (i=0; i<imax; i++) {
497     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
498     idx_i = idx[i];
499     len   = n[i];
500     for (j=0; j<len; j++) {
501       row = idx_i[j];
502       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
504       w4[proc]++;
505     }
506     for (j=0; j<size; j++) {
507       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508     }
509   }
510 
511   nrqs     = 0;              /* no of outgoing messages */
512   msz      = 0;              /* total mesg length (for all proc */
513   w1[rank] = 0;              /* no mesg sent to intself */
514   w3[rank] = 0;
515   for (i=0; i<size; i++) {
516     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517   }
518   /* pa - is list of processors to communicate with */
519   ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
520   for (i=0,j=0; i<size; i++) {
521     if (w1[i]) {pa[j] = i; j++;}
522   }
523 
524   /* Each message would have a header = 1 + 2*(no of IS) + data */
525   for (i=0; i<nrqs; i++) {
526     j      = pa[i];
527     w1[j] += w2[j] + 2*w3[j];
528     msz   += w1[j];
529   }
530 
531   /* Determine the number of messages to expect, their lengths, from from-ids */
532   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
533   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
534 
535   /* Now post the Irecvs corresponding to these messages */
536   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
537 
538   /* Allocate Memory for outgoing messages */
539   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
540   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
541   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
542 
543   {
544     PetscInt *iptr = tmp,ict  = 0;
545     for (i=0; i<nrqs; i++) {
546       j         = pa[i];
547       iptr     +=  ict;
548       outdat[j] = iptr;
549       ict       = w1[j];
550     }
551   }
552 
553   /* Form the outgoing messages */
554   /* plug in the headers */
555   for (i=0; i<nrqs; i++) {
556     j            = pa[i];
557     outdat[j][0] = 0;
558     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
559     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560   }
561 
562   /* Memory for doing local proc's work */
563   {
564     PetscInt M_BPB_imax = 0;
565 #if defined(PETSC_USE_CTABLE)
566     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
567     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
570     }
571     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574     }
575 #else
576     PetscInt Mimax = 0;
577     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
578     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
579     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582       data[i]  = d_p + M*i;
583     }
584 #endif
585   }
586 
587   /* Parse the IS and update local tables and the outgoing buf with the data */
588   {
589     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590     PetscBT  table_i;
591 
592     for (i=0; i<imax; i++) {
593       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
594       n_i     = n[i];
595       table_i = table[i];
596       idx_i   = idx[i];
597 #if defined(PETSC_USE_CTABLE)
598       table_data_i = table_data[i];
599 #else
600       data_i  = data[i];
601 #endif
602       isz_i   = isz[i];
603       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604         row  = idx_i[j];
605         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
606         if (proc != rank) { /* copy to the outgoing buffer */
607           ctr[proc]++;
608           *ptr[proc] = row;
609           ptr[proc]++;
610         } else if (!PetscBTLookupSet(table_i,row)) {
611 #if defined(PETSC_USE_CTABLE)
612           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
613 #else
614           data_i[isz_i] = row; /* Update the local table */
615 #endif
616           isz_i++;
617         }
618       }
619       /* Update the headers for the current IS */
620       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621         if ((ctr_j = ctr[j])) {
622           outdat_j        = outdat[j];
623           k               = ++outdat_j[0];
624           outdat_j[2*k]   = ctr_j;
625           outdat_j[2*k-1] = i;
626         }
627       }
628       isz[i] = isz_i;
629     }
630   }
631 
632   /*  Now  post the sends */
633   ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
634   for (i=0; i<nrqs; ++i) {
635     j    = pa[i];
636     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
637   }
638 
639   /* No longer need the original indices */
640   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
641   for (i=0; i<imax; ++i) {
642     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
643     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
644   }
645   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
646 
647   for (i=0; i<imax; ++i) {
648     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
649   }
650 
651   /* Do Local work */
652 #if defined(PETSC_USE_CTABLE)
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
654 #else
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
656 #endif
657 
658   /* Receive messages */
659   ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
660   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
661   ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
662 
663   /* Phase 1 sends are complete - deallocate buffers */
664   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
665   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
666 
667   ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
668   ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
669   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
670   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
671   ierr = PetscFree(rbuf);CHKERRQ(ierr);
672 
673   /* Send the data back */
674   /* Do a global reduction to know the buffer space req for incoming messages */
675   {
676     PetscMPIInt *rw1;
677 
678     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
679 
680     for (i=0; i<nrqr; ++i) {
681       proc = recv_status[i].MPI_SOURCE;
682 
683       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
684       rw1[proc] = isz1[i];
685     }
686     ierr = PetscFree(onodes1);CHKERRQ(ierr);
687     ierr = PetscFree(olengths1);CHKERRQ(ierr);
688 
689     /* Determine the number of messages to expect, their lengths, from from-ids */
690     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
691     ierr = PetscFree(rw1);CHKERRQ(ierr);
692   }
693   /* Now post the Irecvs corresponding to these messages */
694   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
695 
696   /* Now  post the sends */
697   ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
698   for (i=0; i<nrqr; ++i) {
699     j    = recv_status[i].MPI_SOURCE;
700     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
701   }
702 
703   /* receive work done on other processors */
704   {
705     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
706     PetscMPIInt idex;
707     PetscBT     table_i;
708 
709     for (i=0; i<nrqs; ++i) {
710       ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
711       /* Process the message */
712       rbuf2_i = rbuf2[idex];
713       ct1     = 2*rbuf2_i[0]+1;
714       jmax    = rbuf2[idex][0];
715       for (j=1; j<=jmax; j++) {
716         max     = rbuf2_i[2*j];
717         is_no   = rbuf2_i[2*j-1];
718         isz_i   = isz[is_no];
719         table_i = table[is_no];
720 #if defined(PETSC_USE_CTABLE)
721         table_data_i = table_data[is_no];
722 #else
723         data_i  = data[is_no];
724 #endif
725         for (k=0; k<max; k++,ct1++) {
726           row = rbuf2_i[ct1];
727           if (!PetscBTLookupSet(table_i,row)) {
728 #if defined(PETSC_USE_CTABLE)
729             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
730 #else
731             data_i[isz_i] = row;
732 #endif
733             isz_i++;
734           }
735         }
736         isz[is_no] = isz_i;
737       }
738     }
739 
740     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
741   }
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
768   }
769 
770   ierr = PetscFree(iscomms);CHKERRQ(ierr);
771   ierr = PetscFree(onodes2);CHKERRQ(ierr);
772   ierr = PetscFree(olengths2);CHKERRQ(ierr);
773 
774   ierr = PetscFree(pa);CHKERRQ(ierr);
775   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
776   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
779   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
780   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
781   ierr = PetscFree(recv_status);CHKERRQ(ierr);
782   if (xdata) {
783     ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
784     ierr = PetscFree(xdata);CHKERRQ(ierr);
785   }
786   ierr = PetscFree(isz1);CHKERRQ(ierr);
787 #if defined(PETSC_USE_CTABLE)
788   for (i=0; i<imax; i++) {
789     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
790   }
791   ierr = PetscFree(table_data);CHKERRQ(ierr);
792   ierr = PetscFree(tdata);CHKERRQ(ierr);
793   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
794 #else
795   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
796 #endif
797   PetscFunctionReturn(0);
798 }
799 
800 /*
801    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
802        the work on the local processor.
803 
804      Inputs:
805       C      - MAT_MPIAIJ;
806       imax - total no of index sets processed at a time;
807       table  - an array of char - size = m bits.
808 
809      Output:
810       isz    - array containing the count of the solution elements corresponding
811                to each index set;
812       data or table_data  - pointer to the solutions
813 */
814 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
815 {
816   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817   Mat        A  = c->A,B = c->B;
818   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
820   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
821   PetscBT    table_i;
822 #if defined(PETSC_USE_CTABLE)
823   PetscTable         table_data_i;
824   PetscErrorCode     ierr;
825   PetscTablePosition tpos;
826   PetscInt           tcount,*tdata;
827 #else
828   PetscInt           *data_i;
829 #endif
830 
831   PetscFunctionBegin;
832   rstart = C->rmap->rstart;
833   cstart = C->cmap->rstart;
834   ai     = a->i;
835   aj     = a->j;
836   bi     = b->i;
837   bj     = b->j;
838   garray = c->garray;
839 
840   for (i=0; i<imax; i++) {
841 #if defined(PETSC_USE_CTABLE)
842     /* copy existing entries of table_data_i into tdata[] */
843     table_data_i = table_data[i];
844     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
845     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);
846 
847     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
848     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
849     while (tpos) {
850       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
851       tdata[--j] = --row;
852       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
853     }
854 #else
855     data_i  = data[i];
856 #endif
857     table_i = table[i];
858     isz_i   = isz[i];
859     max     = isz[i];
860 
861     for (j=0; j<max; j++) {
862 #if defined(PETSC_USE_CTABLE)
863       row   = tdata[j] - rstart;
864 #else
865       row   = data_i[j] - rstart;
866 #endif
867       start = ai[row];
868       end   = ai[row+1];
869       for (k=start; k<end; k++) { /* Amat */
870         val = aj[k] + cstart;
871         if (!PetscBTLookupSet(table_i,val)) {
872 #if defined(PETSC_USE_CTABLE)
873           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
874 #else
875           data_i[isz_i] = val;
876 #endif
877           isz_i++;
878         }
879       }
880       start = bi[row];
881       end   = bi[row+1];
882       for (k=start; k<end; k++) { /* Bmat */
883         val = garray[bj[k]];
884         if (!PetscBTLookupSet(table_i,val)) {
885 #if defined(PETSC_USE_CTABLE)
886           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
887 #else
888           data_i[isz_i] = val;
889 #endif
890           isz_i++;
891         }
892       }
893     }
894     isz[i] = isz_i;
895 
896 #if defined(PETSC_USE_CTABLE)
897     ierr = PetscFree(tdata);CHKERRQ(ierr);
898 #endif
899   }
900   PetscFunctionReturn(0);
901 }
902 
903 /*
904       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905          and return the output
906 
907          Input:
908            C    - the matrix
909            nrqr - no of messages being processed.
910            rbuf - an array of pointers to the received requests
911 
912          Output:
913            xdata - array of messages to be sent back
914            isz1  - size of each message
915 
916   For better efficiency perhaps we should malloc separately each xdata[i],
917 then if a remalloc is required we need only copy the data for that one row
918 rather then all previous rows as it is now where a single large chunk of
919 memory is used.
920 
921 */
922 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923 {
924   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
925   Mat            A  = c->A,B = c->B;
926   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
927   PetscErrorCode ierr;
928   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931   PetscInt       *rbuf_i,kmax,rbuf_0;
932   PetscBT        xtable;
933 
934   PetscFunctionBegin;
935   m      = C->rmap->N;
936   rstart = C->rmap->rstart;
937   cstart = C->cmap->rstart;
938   ai     = a->i;
939   aj     = a->j;
940   bi     = b->i;
941   bj     = b->j;
942   garray = c->garray;
943 
944   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945     rbuf_i =  rbuf[i];
946     rbuf_0 =  rbuf_i[0];
947     ct    += rbuf_0;
948     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949   }
950 
951   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952   else max1 = 1;
953   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954   if (nrqr) {
955     ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
956     ++no_malloc;
957   }
958   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
959   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
960 
961   ct3 = 0;
962   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
963     rbuf_i =  rbuf[i];
964     rbuf_0 =  rbuf_i[0];
965     ct1    =  2*rbuf_0+1;
966     ct2    =  ct1;
967     ct3   += ct1;
968     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
970       oct2 = ct2;
971       kmax = rbuf_i[2*j];
972       for (k=0; k<kmax; k++,ct1++) {
973         row = rbuf_i[ct1];
974         if (!PetscBTLookupSet(xtable,row)) {
975           if (!(ct3 < mem_estimate)) {
976             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
978             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
979             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
980             xdata[0]     = tmp;
981             mem_estimate = new_estimate; ++no_malloc;
982             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983           }
984           xdata[i][ct2++] = row;
985           ct3++;
986         }
987       }
988       for (k=oct2,max2=ct2; k<max2; k++) {
989         row   = xdata[i][k] - rstart;
990         start = ai[row];
991         end   = ai[row+1];
992         for (l=start; l<end; l++) {
993           val = aj[l] + cstart;
994           if (!PetscBTLookupSet(xtable,val)) {
995             if (!(ct3 < mem_estimate)) {
996               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
998               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
999               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1000               xdata[0]     = tmp;
1001               mem_estimate = new_estimate; ++no_malloc;
1002               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003             }
1004             xdata[i][ct2++] = val;
1005             ct3++;
1006           }
1007         }
1008         start = bi[row];
1009         end   = bi[row+1];
1010         for (l=start; l<end; l++) {
1011           val = garray[bj[l]];
1012           if (!PetscBTLookupSet(xtable,val)) {
1013             if (!(ct3 < mem_estimate)) {
1014               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1016               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1017               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1018               xdata[0]     = tmp;
1019               mem_estimate = new_estimate; ++no_malloc;
1020               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021             }
1022             xdata[i][ct2++] = val;
1023             ct3++;
1024           }
1025         }
1026       }
1027       /* Update the header*/
1028       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030     }
1031     xdata[i][0] = rbuf_0;
1032     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1033     isz1[i]     = ct2; /* size of each message */
1034   }
1035   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1036   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 /* -------------------------------------------------------------------------*/
1040 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041 /*
1042     Every processor gets the entire matrix
1043 */
1044 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1045 {
1046   Mat            B;
1047   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1048   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1049   PetscErrorCode ierr;
1050   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1051   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1052   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1053 
1054   PetscFunctionBegin;
1055   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1056   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1057   if (scall == MAT_INITIAL_MATRIX) {
1058     /* ----------------------------------------------------------------
1059          Tell every processor the number of nonzeros per row
1060     */
1061     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1062     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1063       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1064     }
1065     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1066     for (i=0; i<size; i++) {
1067       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1068       displs[i]     = A->rmap->range[i];
1069     }
1070 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1071     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1072 #else
1073     sendcount = A->rmap->rend - A->rmap->rstart;
1074     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1075 #endif
1076     /* ---------------------------------------------------------------
1077          Create the sequential matrix of the same type as the local block diagonal
1078     */
1079     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1080     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1081     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1082     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1083     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1084     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1085     **Bin = B;
1086     b     = (Mat_SeqAIJ*)B->data;
1087 
1088     /*--------------------------------------------------------------------
1089        Copy my part of matrix column indices over
1090     */
1091     sendcount  = ad->nz + bd->nz;
1092     jsendbuf   = b->j + b->i[rstarts[rank]];
1093     a_jsendbuf = ad->j;
1094     b_jsendbuf = bd->j;
1095     n          = A->rmap->rend - A->rmap->rstart;
1096     cnt        = 0;
1097     for (i=0; i<n; i++) {
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into useable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1161     MatScalar         *sendbuf,*recvbuf;
1162 
1163     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1164     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1165     sendcount = ad->nz + bd->nz;
1166     sendbuf   = b->a + b->i[rstarts[rank]];
1167     a_sendbuf = ada;
1168     b_sendbuf = bda;
1169     b_sendj   = bd->j;
1170     n         = A->rmap->rend - A->rmap->rstart;
1171     cnt       = 0;
1172     for (i=0; i<n; i++) {
1173       /* put in lower diagonal portion */
1174       m = bd->i[i+1] - bd->i[i];
1175       while (m > 0) {
1176         /* is it above diagonal (in bd (compressed) numbering) */
1177         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1178         sendbuf[cnt++] = *b_sendbuf++;
1179         m--;
1180         b_sendj++;
1181       }
1182 
1183       /* put in diagonal portion */
1184       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1185         sendbuf[cnt++] = *a_sendbuf++;
1186       }
1187 
1188       /* put in upper diagonal portion */
1189       while (m-- > 0) {
1190         sendbuf[cnt++] = *b_sendbuf++;
1191         b_sendj++;
1192       }
1193     }
1194     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1195 
1196     /* -----------------------------------------------------------------
1197        Gather all numerical values to all processors
1198     */
1199     if (!recvcounts) {
1200       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1201     }
1202     for (i=0; i<size; i++) {
1203       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1204     }
1205     displs[0] = 0;
1206     for (i=1; i<size; i++) {
1207       displs[i] = displs[i-1] + recvcounts[i-1];
1208     }
1209     recvbuf = b->a;
1210 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1211     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1212 #else
1213     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1214 #endif
1215     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1216     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1217   }  /* endof (flag == MAT_GET_VALUES) */
1218   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1219 
1220   if (A->symmetric) {
1221     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1222   } else if (A->hermitian) {
1223     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1224   } else if (A->structurally_symmetric) {
1225     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1226   }
1227   PetscFunctionReturn(0);
1228 }
1229 
1230 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1231 {
1232   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1233   Mat            submat,A = c->A,B = c->B;
1234   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1235   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1236   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1237   const PetscInt *icol,*irow;
1238   PetscInt       nrow,ncol,start;
1239   PetscErrorCode ierr;
1240   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1241   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1242   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1243   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1244   PetscInt       *lens,rmax,ncols,*cols,Crow;
1245 #if defined(PETSC_USE_CTABLE)
1246   PetscTable     cmap,rmap;
1247   PetscInt       *cmap_loc,*rmap_loc;
1248 #else
1249   PetscInt       *cmap,*rmap;
1250 #endif
1251   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1252   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1253   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1254   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1255   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1256   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1257   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1258   MPI_Comm       comm;
1259   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1260   PetscMPIInt    *onodes1,*olengths1,idex,end;
1261   Mat_SubSppt    *smatis1;
1262   PetscBool      isrowsorted,iscolsorted;
1263 
1264   PetscFunctionBegin;
1265   PetscValidLogicalCollectiveInt(C,ismax,2);
1266   PetscValidLogicalCollectiveEnum(C,scall,5);
1267   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1269   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1270   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1271   size = c->size;
1272   rank = c->rank;
1273 
1274   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1275   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1276   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1277   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1278   if (allcolumns) {
1279     icol = NULL;
1280     ncol = C->cmap->N;
1281   } else {
1282     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1283     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1284   }
1285 
1286   if (scall == MAT_INITIAL_MATRIX) {
1287     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1288 
1289     /* Get some new tags to keep the communication clean */
1290     tag1 = ((PetscObject)C)->tag;
1291     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1292     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1293 
1294     /* evaluate communication - mesg to who, length of mesg, and buffer space
1295      required. Based on this, buffers are allocated, and data copied into them */
1296     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1297     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1298 
1299     /* w1[proc] = num of rows owned by proc -- to be requested */
1300     proc = 0;
1301     nrqs = 0; /* num of outgoing messages */
1302     for (j=0; j<nrow; j++) {
1303       row  = irow[j];
1304       if (!isrowsorted) proc = 0;
1305       while (row >= C->rmap->range[proc+1]) proc++;
1306       w1[proc]++;
1307       row2proc[j] = proc; /* map row index to proc */
1308 
1309       if (proc != rank && !w2[proc]) {
1310         w2[proc] = 1; nrqs++;
1311       }
1312     }
1313     w1[rank] = 0;  /* rows owned by self will not be requested */
1314 
1315     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
1316     for (proc=0,j=0; proc<size; proc++) {
1317       if (w1[proc]) { pa[j++] = proc;}
1318     }
1319 
1320     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321     msz = 0;              /* total mesg length (for all procs) */
1322     for (i=0; i<nrqs; i++) {
1323       proc      = pa[i];
1324       w1[proc] += 3;
1325       msz      += w1[proc];
1326     }
1327     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1328 
1329     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1330     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1332 
1333     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1336 
1337     /* Now post the Irecvs corresponding to these messages */
1338     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1339 
1340     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1341     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1342 
1343     /* Allocate Memory for outgoing messages */
1344     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1345     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1346     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1347 
1348     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1349     iptr = tmp;
1350     for (i=0; i<nrqs; i++) {
1351       proc        = pa[i];
1352       sbuf1[proc] = iptr;
1353       iptr       += w1[proc];
1354     }
1355 
1356     /* Form the outgoing messages */
1357     /* Initialize the header space */
1358     for (i=0; i<nrqs; i++) {
1359       proc      = pa[i];
1360       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1361       ptr[proc] = sbuf1[proc] + 3;
1362     }
1363 
1364     /* Parse the isrow and copy data into outbuf */
1365     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1366     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1367       proc = row2proc[j];
1368       if (proc != rank) { /* copy to the outgoing buf*/
1369         *ptr[proc] = irow[j];
1370         ctr[proc]++; ptr[proc]++;
1371       }
1372     }
1373 
1374     /* Update the headers for the current IS */
1375     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376       if ((ctr_j = ctr[j])) {
1377         sbuf1_j        = sbuf1[j];
1378         k              = ++sbuf1_j[0];
1379         sbuf1_j[2*k]   = ctr_j;
1380         sbuf1_j[2*k-1] = 0;
1381       }
1382     }
1383 
1384     /* Now post the sends */
1385     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
1386     for (i=0; i<nrqs; ++i) {
1387       proc = pa[i];
1388       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1389     }
1390 
1391     /* Post Receives to capture the buffer size */
1392     ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr);
1393     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
1394 
1395     if (nrqs) rbuf2[0] = tmp + msz;
1396     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1397 
1398     for (i=0; i<nrqs; ++i) {
1399       proc = pa[i];
1400       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1401     }
1402 
1403     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1404 
1405     /* Send to other procs the buf size they should allocate */
1406     /* Receive messages*/
1407     ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
1408     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1409 
1410     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
1411     for (i=0; i<nrqr; ++i) {
1412       req_size[i] = 0;
1413       rbuf1_i        = rbuf1[i];
1414       start          = 2*rbuf1_i[0] + 1;
1415       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1416       ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
1417       sbuf2_i        = sbuf2[i];
1418       for (j=start; j<end; j++) {
1419         k            = rbuf1_i[j] - rstart;
1420         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421         sbuf2_i[j]   = ncols;
1422         req_size[i] += ncols;
1423       }
1424       req_source1[i] = r_status1[i].MPI_SOURCE;
1425 
1426       /* form the header */
1427       sbuf2_i[0] = req_size[i];
1428       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1429 
1430       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1431     }
1432 
1433     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1434     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1435 
1436     /* rbuf2 is received, Post recv column indices a->j */
1437     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1438 
1439     ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr);
1440     for (i=0; i<nrqs; ++i) {
1441       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
1442       req_source2[i] = r_status2[i].MPI_SOURCE;
1443       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1444     }
1445 
1446     /* Wait on sends1 and sends2 */
1447     ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr);
1448     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1449     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1450     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1451 
1452     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1453     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1454 
1455     /* Now allocate sending buffers for a->j, and send them off */
1456     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
1457     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
1459     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1460 
1461     for (i=0; i<nrqr; i++) { /* for each requested message */
1462       rbuf1_i   = rbuf1[i];
1463       sbuf_aj_i = sbuf_aj[i];
1464       ct1       = 2*rbuf1_i[0] + 1;
1465       ct2       = 0;
1466       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1467 
1468       kmax = rbuf1[i][2];
1469       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470         row    = rbuf1_i[ct1] - rstart;
1471         nzA    = ai[row+1] - ai[row];
1472         nzB    = bi[row+1] - bi[row];
1473         ncols  = nzA + nzB;
1474         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1475 
1476         /* load the column indices for this row into cols*/
1477         cols = sbuf_aj_i + ct2;
1478 
1479         lwrite = 0;
1480         for (l=0; l<nzB; l++) {
1481           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482         }
1483         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484         for (l=0; l<nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486         }
1487 
1488         ct2 += ncols;
1489       }
1490       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1491     }
1492 
1493     /* create column map (cmap): global col of C -> local col of submat */
1494 #if defined(PETSC_USE_CTABLE)
1495     if (!allcolumns) {
1496       ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr);
1497       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1498       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499         if (icol[j] >= cstart && icol[j] <cend) {
1500           cmap_loc[icol[j] - cstart] = j+1;
1501         } else { /* use PetscTable for non-local col indices */
1502           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1503         }
1504       }
1505     } else {
1506       cmap     = NULL;
1507       cmap_loc = NULL;
1508     }
1509     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1510 #else
1511     if (!allcolumns) {
1512       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1513       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514     } else {
1515       cmap = NULL;
1516     }
1517 #endif
1518 
1519     /* Create lens for MatSeqAIJSetPreallocation() */
1520     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
1521 
1522     /* Compute lens from local part of C */
1523     for (j=0; j<nrow; j++) {
1524       row  = irow[j];
1525       proc = row2proc[j];
1526       if (proc == rank) {
1527         /* diagonal part A = c->A */
1528         ncols = ai[row-rstart+1] - ai[row-rstart];
1529         cols  = aj + ai[row-rstart];
1530         if (!allcolumns) {
1531           for (k=0; k<ncols; k++) {
1532 #if defined(PETSC_USE_CTABLE)
1533             tcol = cmap_loc[cols[k]];
1534 #else
1535             tcol = cmap[cols[k]+cstart];
1536 #endif
1537             if (tcol) lens[j]++;
1538           }
1539         } else { /* allcolumns */
1540           lens[j] = ncols;
1541         }
1542 
1543         /* off-diagonal part B = c->B */
1544         ncols = bi[row-rstart+1] - bi[row-rstart];
1545         cols  = bj + bi[row-rstart];
1546         if (!allcolumns) {
1547           for (k=0; k<ncols; k++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1550 #else
1551             tcol = cmap[bmap[cols[k]]];
1552 #endif
1553             if (tcol) lens[j]++;
1554           }
1555         } else { /* allcolumns */
1556           lens[j] += ncols;
1557         }
1558       }
1559     }
1560 
1561     /* Create row map (rmap): global row of C -> local row of submat */
1562 #if defined(PETSC_USE_CTABLE)
1563     ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr);
1564     for (j=0; j<nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) { /* a local row */
1568         rmap_loc[row - rstart] = j;
1569       } else {
1570         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1571       }
1572     }
1573 #else
1574     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1575     for (j=0; j<nrow; j++) {
1576       rmap[irow[j]] = j;
1577     }
1578 #endif
1579 
1580     /* Update lens from offproc data */
1581     /* recv a->j is done */
1582     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1583     for (i=0; i<nrqs; i++) {
1584       proc    = pa[i];
1585       sbuf1_i = sbuf1[proc];
1586       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587       ct1     = 2 + 1;
1588       ct2     = 0;
1589       rbuf2_i = rbuf2[i]; /* received length of C->j */
1590       rbuf3_i = rbuf3[i]; /* received C->j */
1591 
1592       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593       max1   = sbuf1_i[2];
1594       for (k=0; k<max1; k++,ct1++) {
1595 #if defined(PETSC_USE_CTABLE)
1596         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1597         row--;
1598         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599 #else
1600         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601 #endif
1602         /* Now, store row index of submat in sbuf1_i[ct1] */
1603         sbuf1_i[ct1] = row;
1604 
1605         nnz = rbuf2_i[ct1];
1606         if (!allcolumns) {
1607           for (l=0; l<nnz; l++,ct2++) {
1608 #if defined(PETSC_USE_CTABLE)
1609             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611             } else {
1612               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1613             }
1614 #else
1615             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616 #endif
1617             if (tcol) lens[row]++;
1618           }
1619         } else { /* allcolumns */
1620           lens[row] += nnz;
1621         }
1622       }
1623     }
1624     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1625     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1626 
1627     /* Create the submatrices */
1628     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1629     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1630 
1631     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1632     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1633     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1634     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1635     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1636 
1637     /* create struct Mat_SubSppt and attached it to submat */
1638     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1639     subc = (Mat_SeqAIJ*)submat->data;
1640     subc->submatis1 = smatis1;
1641 
1642     smatis1->id          = 0;
1643     smatis1->nrqs        = nrqs;
1644     smatis1->nrqr        = nrqr;
1645     smatis1->rbuf1       = rbuf1;
1646     smatis1->rbuf2       = rbuf2;
1647     smatis1->rbuf3       = rbuf3;
1648     smatis1->sbuf2       = sbuf2;
1649     smatis1->req_source2 = req_source2;
1650 
1651     smatis1->sbuf1       = sbuf1;
1652     smatis1->ptr         = ptr;
1653     smatis1->tmp         = tmp;
1654     smatis1->ctr         = ctr;
1655 
1656     smatis1->pa           = pa;
1657     smatis1->req_size     = req_size;
1658     smatis1->req_source1  = req_source1;
1659 
1660     smatis1->allcolumns  = allcolumns;
1661     smatis1->singleis    = PETSC_TRUE;
1662     smatis1->row2proc    = row2proc;
1663     smatis1->rmap        = rmap;
1664     smatis1->cmap        = cmap;
1665 #if defined(PETSC_USE_CTABLE)
1666     smatis1->rmap_loc    = rmap_loc;
1667     smatis1->cmap_loc    = cmap_loc;
1668 #endif
1669 
1670     smatis1->destroy     = submat->ops->destroy;
1671     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672     submat->factortype   = C->factortype;
1673 
1674     /* compute rmax */
1675     rmax = 0;
1676     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1677 
1678   } else { /* scall == MAT_REUSE_MATRIX */
1679     submat = submats[0];
1680     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1681 
1682     subc    = (Mat_SeqAIJ*)submat->data;
1683     rmax    = subc->rmax;
1684     smatis1 = subc->submatis1;
1685     nrqs        = smatis1->nrqs;
1686     nrqr        = smatis1->nrqr;
1687     rbuf1       = smatis1->rbuf1;
1688     rbuf2       = smatis1->rbuf2;
1689     rbuf3       = smatis1->rbuf3;
1690     req_source2 = smatis1->req_source2;
1691 
1692     sbuf1     = smatis1->sbuf1;
1693     sbuf2     = smatis1->sbuf2;
1694     ptr       = smatis1->ptr;
1695     tmp       = smatis1->tmp;
1696     ctr       = smatis1->ctr;
1697 
1698     pa         = smatis1->pa;
1699     req_size   = smatis1->req_size;
1700     req_source1 = smatis1->req_source1;
1701 
1702     allcolumns = smatis1->allcolumns;
1703     row2proc   = smatis1->row2proc;
1704     rmap       = smatis1->rmap;
1705     cmap       = smatis1->cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     rmap_loc   = smatis1->rmap_loc;
1708     cmap_loc   = smatis1->cmap_loc;
1709 #endif
1710   }
1711 
1712   /* Post recv matrix values */
1713   ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1714   ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr);
1715   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1716   for (i=0; i<nrqs; ++i) {
1717     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
1718     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1719   }
1720 
1721   /* Allocate sending buffers for a->a, and send them off */
1722   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
1723   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
1725   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1726 
1727   for (i=0; i<nrqr; i++) {
1728     rbuf1_i   = rbuf1[i];
1729     sbuf_aa_i = sbuf_aa[i];
1730     ct1       = 2*rbuf1_i[0]+1;
1731     ct2       = 0;
1732     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1733 
1734     kmax = rbuf1_i[2];
1735     for (k=0; k<kmax; k++,ct1++) {
1736       row = rbuf1_i[ct1] - rstart;
1737       nzA = ai[row+1] - ai[row];
1738       nzB = bi[row+1] - bi[row];
1739       ncols  = nzA + nzB;
1740       cworkB = bj + bi[row];
1741       vworkA = a_a + ai[row];
1742       vworkB = b_a + bi[row];
1743 
1744       /* load the column values for this row into vals*/
1745       vals = sbuf_aa_i + ct2;
1746 
1747       lwrite = 0;
1748       for (l=0; l<nzB; l++) {
1749         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750       }
1751       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752       for (l=0; l<nzB; l++) {
1753         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754       }
1755 
1756       ct2 += ncols;
1757     }
1758     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1759   }
1760 
1761   /* Assemble submat */
1762   /* First assemble the local rows */
1763   for (j=0; j<nrow; j++) {
1764     row  = irow[j];
1765     proc = row2proc[j];
1766     if (proc == rank) {
1767       Crow = row - rstart;  /* local row index of C */
1768 #if defined(PETSC_USE_CTABLE)
1769       row = rmap_loc[Crow]; /* row index of submat */
1770 #else
1771       row = rmap[row];
1772 #endif
1773 
1774       if (allcolumns) {
1775         /* diagonal part A = c->A */
1776         ncols = ai[Crow+1] - ai[Crow];
1777         cols  = aj + ai[Crow];
1778         vals  = a_a + ai[Crow];
1779         i     = 0;
1780         for (k=0; k<ncols; k++) {
1781           subcols[i]   = cols[k] + cstart;
1782           subvals[i++] = vals[k];
1783         }
1784 
1785         /* off-diagonal part B = c->B */
1786         ncols = bi[Crow+1] - bi[Crow];
1787         cols  = bj + bi[Crow];
1788         vals  = b_a + bi[Crow];
1789         for (k=0; k<ncols; k++) {
1790           subcols[i]   = bmap[cols[k]];
1791           subvals[i++] = vals[k];
1792         }
1793 
1794         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1795 
1796       } else { /* !allcolumns */
1797 #if defined(PETSC_USE_CTABLE)
1798         /* diagonal part A = c->A */
1799         ncols = ai[Crow+1] - ai[Crow];
1800         cols  = aj + ai[Crow];
1801         vals  = a_a + ai[Crow];
1802         i     = 0;
1803         for (k=0; k<ncols; k++) {
1804           tcol = cmap_loc[cols[k]];
1805           if (tcol) {
1806             subcols[i]   = --tcol;
1807             subvals[i++] = vals[k];
1808           }
1809         }
1810 
1811         /* off-diagonal part B = c->B */
1812         ncols = bi[Crow+1] - bi[Crow];
1813         cols  = bj + bi[Crow];
1814         vals  = b_a + bi[Crow];
1815         for (k=0; k<ncols; k++) {
1816           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1817           if (tcol) {
1818             subcols[i]   = --tcol;
1819             subvals[i++] = vals[k];
1820           }
1821         }
1822 #else
1823         /* diagonal part A = c->A */
1824         ncols = ai[Crow+1] - ai[Crow];
1825         cols  = aj + ai[Crow];
1826         vals  = a_a + ai[Crow];
1827         i     = 0;
1828         for (k=0; k<ncols; k++) {
1829           tcol = cmap[cols[k]+cstart];
1830           if (tcol) {
1831             subcols[i]   = --tcol;
1832             subvals[i++] = vals[k];
1833           }
1834         }
1835 
1836         /* off-diagonal part B = c->B */
1837         ncols = bi[Crow+1] - bi[Crow];
1838         cols  = bj + bi[Crow];
1839         vals  = b_a + bi[Crow];
1840         for (k=0; k<ncols; k++) {
1841           tcol = cmap[bmap[cols[k]]];
1842           if (tcol) {
1843             subcols[i]   = --tcol;
1844             subvals[i++] = vals[k];
1845           }
1846         }
1847 #endif
1848         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1849       }
1850     }
1851   }
1852 
1853   /* Now assemble the off-proc rows */
1854   for (i=0; i<nrqs; i++) { /* for each requested message */
1855     /* recv values from other processes */
1856     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1857     proc    = pa[idex];
1858     sbuf1_i = sbuf1[proc];
1859     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1860     ct1     = 2 + 1;
1861     ct2     = 0; /* count of received C->j */
1862     ct3     = 0; /* count of received C->j that will be inserted into submat */
1863     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1864     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1865     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1866 
1867     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868     max1 = sbuf1_i[2];             /* num of rows */
1869     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1870       row = sbuf1_i[ct1]; /* row index of submat */
1871       if (!allcolumns) {
1872         idex = 0;
1873         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1876 #if defined(PETSC_USE_CTABLE)
1877             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879             } else {
1880               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1881             }
1882 #else
1883             tcol = cmap[rbuf3_i[ct2]];
1884 #endif
1885             if (tcol) {
1886               subcols[idex]   = --tcol; /* may not be sorted */
1887               subvals[idex++] = rbuf4_i[ct2];
1888 
1889               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1890                For reuse, we replace received C->j with index that should be inserted to submat */
1891               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892             }
1893           }
1894           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1895         } else { /* scall == MAT_REUSE_MATRIX */
1896           submat = submats[0];
1897           subc   = (Mat_SeqAIJ*)submat->data;
1898 
1899           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900           for (l=0; l<nnz; l++) {
1901             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1902             subvals[idex++] = rbuf4_i[ct2];
1903           }
1904 
1905           bj = subc->j + subc->i[row]; /* sorted column indices */
1906           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1907         }
1908       } else { /* allcolumns */
1909         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1910         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1911         ct2 += nnz;
1912       }
1913     }
1914   }
1915 
1916   /* sending a->a are done */
1917   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1918   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1919 
1920   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   submats[0] = submat;
1923 
1924   /* Restore the indices */
1925   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1926   if (!allcolumns) {
1927     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1928   }
1929 
1930   /* Destroy allocated memory */
1931   for (i=0; i<nrqs; ++i) {
1932     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1933   }
1934   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1935   if (sbuf_aa) {
1936     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1937     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1938   }
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     if (sbuf_aj) {
1943       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1944       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1945     }
1946   }
1947   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1948   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1949   PetscFunctionReturn(0);
1950 }
1951 
1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1953 {
1954   PetscErrorCode ierr;
1955   PetscInt       ncol;
1956   PetscBool      colflag,allcolumns=PETSC_FALSE;
1957 
1958   PetscFunctionBegin;
1959   /* Allocate memory to hold all the submatrices */
1960   if (scall == MAT_INITIAL_MATRIX) {
1961     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1962   }
1963 
1964   /* Check for special case: each processor gets entire matrix columns */
1965   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1966   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1967   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1968 
1969   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 }
1972 
1973 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1974 {
1975   PetscErrorCode ierr;
1976   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1977   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1978   Mat_SeqAIJ     *subc;
1979   Mat_SubSppt    *smat;
1980 
1981   PetscFunctionBegin;
1982   /* Check for special case: each processor has a single IS */
1983   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1984     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1985     C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
1986     PetscFunctionReturn(0);
1987   }
1988 
1989   /* Collect global wantallmatrix and nstages */
1990   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1991   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1992   if (!nmax) nmax = 1;
1993 
1994   if (scall == MAT_INITIAL_MATRIX) {
1995     /* Collect global wantallmatrix and nstages */
1996     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1997       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1998       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1999       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2000       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2001       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2002         wantallmatrix = PETSC_TRUE;
2003 
2004         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2005       }
2006     }
2007 
2008     /* Determine the number of stages through which submatrices are done
2009        Each stage will extract nmax submatrices.
2010        nmax is determined by the matrix column dimension.
2011        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2012     */
2013     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2014 
2015     in[0] = -1*(PetscInt)wantallmatrix;
2016     in[1] = nstages;
2017     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
2018     wantallmatrix = (PetscBool)(-out[0]);
2019     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2020 
2021   } else { /* MAT_REUSE_MATRIX */
2022     if (ismax) {
2023       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2024       smat = subc->submatis1;
2025     } else { /* (*submat)[0] is a dummy matrix */
2026       smat = (Mat_SubSppt*)(*submat)[0]->data;
2027     }
2028     if (!smat) {
2029       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2030       wantallmatrix = PETSC_TRUE;
2031     } else if (smat->singleis) {
2032       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2033       PetscFunctionReturn(0);
2034     } else {
2035       nstages = smat->nstages;
2036     }
2037   }
2038 
2039   if (wantallmatrix) {
2040     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2041     PetscFunctionReturn(0);
2042   }
2043 
2044   /* Allocate memory to hold all the submatrices and dummy submatrices */
2045   if (scall == MAT_INITIAL_MATRIX) {
2046     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2047   }
2048 
2049   for (i=0,pos=0; i<nstages; i++) {
2050     if (pos+nmax <= ismax) max_no = nmax;
2051     else if (pos >= ismax) max_no = 0;
2052     else                   max_no = ismax-pos;
2053 
2054     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2055     if (!max_no) {
2056       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2057         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2058         smat->nstages = nstages;
2059       }
2060       pos++; /* advance to next dummy matrix if any */
2061     } else pos += max_no;
2062   }
2063 
2064   if (ismax && scall == MAT_INITIAL_MATRIX) {
2065     /* save nstages for reuse */
2066     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2067     smat = subc->submatis1;
2068     smat->nstages = nstages;
2069   }
2070   PetscFunctionReturn(0);
2071 }
2072 
2073 /* -------------------------------------------------------------------------*/
2074 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2075 {
2076   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2077   Mat            A  = c->A;
2078   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2079   const PetscInt **icol,**irow;
2080   PetscInt       *nrow,*ncol,start;
2081   PetscErrorCode ierr;
2082   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2083   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2084   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2085   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2086   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2087 #if defined(PETSC_USE_CTABLE)
2088   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2089 #else
2090   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2091 #endif
2092   const PetscInt *irow_i;
2093   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2094   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2095   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2096   MPI_Comm       comm;
2097   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2098   PetscMPIInt    *onodes1,*olengths1,end;
2099   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2100   Mat_SubSppt    *smat_i;
2101   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2102   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2103 
2104   PetscFunctionBegin;
2105   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2106   size = c->size;
2107   rank = c->rank;
2108 
2109   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2110   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2111 
2112   for (i=0; i<ismax; i++) {
2113     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2114     if (!issorted[i]) iscsorted = issorted[i];
2115 
2116     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2117 
2118     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2119     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2120 
2121     /* Check for special case: allcolumn */
2122     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2123     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2124     if (colflag && ncol[i] == C->cmap->N) {
2125       allcolumns[i] = PETSC_TRUE;
2126       icol[i] = NULL;
2127     } else {
2128       allcolumns[i] = PETSC_FALSE;
2129       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2130     }
2131   }
2132 
2133   if (scall == MAT_REUSE_MATRIX) {
2134     /* Assumes new rows are same length as the old rows */
2135     for (i=0; i<ismax; i++) {
2136       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2137       subc = (Mat_SeqAIJ*)submats[i]->data;
2138       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2139 
2140       /* Initial matrix as if empty */
2141       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2142 
2143       smat_i   = subc->submatis1;
2144 
2145       nrqs        = smat_i->nrqs;
2146       nrqr        = smat_i->nrqr;
2147       rbuf1       = smat_i->rbuf1;
2148       rbuf2       = smat_i->rbuf2;
2149       rbuf3       = smat_i->rbuf3;
2150       req_source2 = smat_i->req_source2;
2151 
2152       sbuf1     = smat_i->sbuf1;
2153       sbuf2     = smat_i->sbuf2;
2154       ptr       = smat_i->ptr;
2155       tmp       = smat_i->tmp;
2156       ctr       = smat_i->ctr;
2157 
2158       pa          = smat_i->pa;
2159       req_size    = smat_i->req_size;
2160       req_source1 = smat_i->req_source1;
2161 
2162       allcolumns[i] = smat_i->allcolumns;
2163       row2proc[i]   = smat_i->row2proc;
2164       rmap[i]       = smat_i->rmap;
2165       cmap[i]       = smat_i->cmap;
2166     }
2167 
2168     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2169       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2170       smat_i = (Mat_SubSppt*)submats[0]->data;
2171 
2172       nrqs        = smat_i->nrqs;
2173       nrqr        = smat_i->nrqr;
2174       rbuf1       = smat_i->rbuf1;
2175       rbuf2       = smat_i->rbuf2;
2176       rbuf3       = smat_i->rbuf3;
2177       req_source2 = smat_i->req_source2;
2178 
2179       sbuf1       = smat_i->sbuf1;
2180       sbuf2       = smat_i->sbuf2;
2181       ptr         = smat_i->ptr;
2182       tmp         = smat_i->tmp;
2183       ctr         = smat_i->ctr;
2184 
2185       pa          = smat_i->pa;
2186       req_size    = smat_i->req_size;
2187       req_source1 = smat_i->req_source1;
2188 
2189       allcolumns[0] = PETSC_FALSE;
2190     }
2191   } else { /* scall == MAT_INITIAL_MATRIX */
2192     /* Get some new tags to keep the communication clean */
2193     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2194     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2195 
2196     /* evaluate communication - mesg to who, length of mesg, and buffer space
2197      required. Based on this, buffers are allocated, and data copied into them*/
2198     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
2199 
2200     for (i=0; i<ismax; i++) {
2201       jmax   = nrow[i];
2202       irow_i = irow[i];
2203 
2204       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2205       row2proc[i] = row2proc_i;
2206 
2207       if (issorted[i]) proc = 0;
2208       for (j=0; j<jmax; j++) {
2209         if (!issorted[i]) proc = 0;
2210         row = irow_i[j];
2211         while (row >= C->rmap->range[proc+1]) proc++;
2212         w4[proc]++;
2213         row2proc_i[j] = proc; /* map row index to proc */
2214       }
2215       for (j=0; j<size; j++) {
2216         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2217       }
2218     }
2219 
2220     nrqs     = 0;              /* no of outgoing messages */
2221     msz      = 0;              /* total mesg length (for all procs) */
2222     w1[rank] = 0;              /* no mesg sent to self */
2223     w3[rank] = 0;
2224     for (i=0; i<size; i++) {
2225       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2226     }
2227     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
2228     for (i=0,j=0; i<size; i++) {
2229       if (w1[i]) { pa[j] = i; j++; }
2230     }
2231 
2232     /* Each message would have a header = 1 + 2*(no of IS) + data */
2233     for (i=0; i<nrqs; i++) {
2234       j      = pa[i];
2235       w1[j] += w2[j] + 2* w3[j];
2236       msz   += w1[j];
2237     }
2238     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2239 
2240     /* Determine the number of messages to expect, their lengths, from from-ids */
2241     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2242     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2243 
2244     /* Now post the Irecvs corresponding to these messages */
2245     ierr = PetscObjectGetNewTag((PetscObject)C,&tag0);CHKERRQ(ierr);
2246     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2247 
2248     /* Allocate Memory for outgoing messages */
2249     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2250     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2251     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2252 
2253     {
2254       PetscInt *iptr = tmp;
2255       k    = 0;
2256       for (i=0; i<nrqs; i++) {
2257         j        = pa[i];
2258         iptr    += k;
2259         sbuf1[j] = iptr;
2260         k        = w1[j];
2261       }
2262     }
2263 
2264     /* Form the outgoing messages. Initialize the header space */
2265     for (i=0; i<nrqs; i++) {
2266       j           = pa[i];
2267       sbuf1[j][0] = 0;
2268       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2269       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2270     }
2271 
2272     /* Parse the isrow and copy data into outbuf */
2273     for (i=0; i<ismax; i++) {
2274       row2proc_i = row2proc[i];
2275       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2276       irow_i = irow[i];
2277       jmax   = nrow[i];
2278       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2279         proc = row2proc_i[j];
2280         if (proc != rank) { /* copy to the outgoing buf*/
2281           ctr[proc]++;
2282           *ptr[proc] = irow_i[j];
2283           ptr[proc]++;
2284         }
2285       }
2286       /* Update the headers for the current IS */
2287       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2288         if ((ctr_j = ctr[j])) {
2289           sbuf1_j        = sbuf1[j];
2290           k              = ++sbuf1_j[0];
2291           sbuf1_j[2*k]   = ctr_j;
2292           sbuf1_j[2*k-1] = i;
2293         }
2294       }
2295     }
2296 
2297     /*  Now  post the sends */
2298     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
2299     for (i=0; i<nrqs; ++i) {
2300       j    = pa[i];
2301       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2302     }
2303 
2304     /* Post Receives to capture the buffer size */
2305     ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr);
2306     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
2307     if (nrqs) rbuf2[0] = tmp + msz;
2308     for (i=1; i<nrqs; ++i) {
2309       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2310     }
2311     for (i=0; i<nrqs; ++i) {
2312       j    = pa[i];
2313       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2314     }
2315 
2316     /* Send to other procs the buf size they should allocate */
2317     /* Receive messages*/
2318     ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
2319     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2320     {
2321       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2322       PetscInt   *sbuf2_i;
2323 
2324       ierr = MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2325       for (i=0; i<nrqr; ++i) {
2326         req_size[i] = 0;
2327         rbuf1_i        = rbuf1[i];
2328         start          = 2*rbuf1_i[0] + 1;
2329         end            = olengths1[i];
2330         ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
2331         sbuf2_i        = sbuf2[i];
2332         for (j=start; j<end; j++) {
2333           id              = rbuf1_i[j] - rstart;
2334           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2335           sbuf2_i[j]      = ncols;
2336           req_size[i] += ncols;
2337         }
2338         req_source1[i] = onodes1[i];
2339         /* form the header */
2340         sbuf2_i[0] = req_size[i];
2341         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2342 
2343         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2344       }
2345     }
2346 
2347     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2348     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2349     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2350     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2351 
2352     /* Receive messages*/
2353     ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr);
2354     ierr = MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2355     for (i=0; i<nrqs; ++i) {
2356       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
2357       req_source2[i] = pa[i];
2358       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2359     }
2360     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2361 
2362     /* Wait on sends1 and sends2 */
2363     ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2364     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2365     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2366     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2367 
2368     /* Now allocate sending buffers for a->j, and send them off */
2369     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
2370     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2371     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
2372     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2373 
2374     ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr);
2375     {
2376       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2377       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2378       PetscInt cend = C->cmap->rend;
2379       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2380 
2381       for (i=0; i<nrqr; i++) {
2382         rbuf1_i   = rbuf1[i];
2383         sbuf_aj_i = sbuf_aj[i];
2384         ct1       = 2*rbuf1_i[0] + 1;
2385         ct2       = 0;
2386         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2387           kmax = rbuf1[i][2*j];
2388           for (k=0; k<kmax; k++,ct1++) {
2389             row    = rbuf1_i[ct1] - rstart;
2390             nzA    = a_i[row+1] - a_i[row];
2391             nzB    = b_i[row+1] - b_i[row];
2392             ncols  = nzA + nzB;
2393             cworkA = a_j + a_i[row];
2394             cworkB = b_j + b_i[row];
2395 
2396             /* load the column indices for this row into cols */
2397             cols = sbuf_aj_i + ct2;
2398 
2399             lwrite = 0;
2400             for (l=0; l<nzB; l++) {
2401               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2402             }
2403             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2404             for (l=0; l<nzB; l++) {
2405               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2406             }
2407 
2408             ct2 += ncols;
2409           }
2410         }
2411         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2412       }
2413     }
2414 
2415     /* create col map: global col of C -> local col of submatrices */
2416     {
2417       const PetscInt *icol_i;
2418 #if defined(PETSC_USE_CTABLE)
2419       for (i=0; i<ismax; i++) {
2420         if (!allcolumns[i]) {
2421           ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2422 
2423           jmax   = ncol[i];
2424           icol_i = icol[i];
2425           cmap_i = cmap[i];
2426           for (j=0; j<jmax; j++) {
2427             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2428           }
2429         } else cmap[i] = NULL;
2430       }
2431 #else
2432       for (i=0; i<ismax; i++) {
2433         if (!allcolumns[i]) {
2434           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2435           jmax   = ncol[i];
2436           icol_i = icol[i];
2437           cmap_i = cmap[i];
2438           for (j=0; j<jmax; j++) {
2439             cmap_i[icol_i[j]] = j+1;
2440           }
2441         } else cmap[i] = NULL;
2442       }
2443 #endif
2444     }
2445 
2446     /* Create lens which is required for MatCreate... */
2447     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2448     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2449 
2450     if (ismax) {
2451       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2452     }
2453     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2454 
2455     /* Update lens from local data */
2456     for (i=0; i<ismax; i++) {
2457       row2proc_i = row2proc[i];
2458       jmax = nrow[i];
2459       if (!allcolumns[i]) cmap_i = cmap[i];
2460       irow_i = irow[i];
2461       lens_i = lens[i];
2462       for (j=0; j<jmax; j++) {
2463         row = irow_i[j];
2464         proc = row2proc_i[j];
2465         if (proc == rank) {
2466           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2467           if (!allcolumns[i]) {
2468             for (k=0; k<ncols; k++) {
2469 #if defined(PETSC_USE_CTABLE)
2470               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2471 #else
2472               tcol = cmap_i[cols[k]];
2473 #endif
2474               if (tcol) lens_i[j]++;
2475             }
2476           } else { /* allcolumns */
2477             lens_i[j] = ncols;
2478           }
2479           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2480         }
2481       }
2482     }
2483 
2484     /* Create row map: global row of C -> local row of submatrices */
2485 #if defined(PETSC_USE_CTABLE)
2486     for (i=0; i<ismax; i++) {
2487       ierr   = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2488       irow_i = irow[i];
2489       jmax   = nrow[i];
2490       for (j=0; j<jmax; j++) {
2491       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2492       }
2493     }
2494 #else
2495     for (i=0; i<ismax; i++) {
2496       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2497       rmap_i = rmap[i];
2498       irow_i = irow[i];
2499       jmax   = nrow[i];
2500       for (j=0; j<jmax; j++) {
2501         rmap_i[irow_i[j]] = j;
2502       }
2503     }
2504 #endif
2505 
2506     /* Update lens from offproc data */
2507     {
2508       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2509 
2510       ierr    = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2511       for (tmp2=0; tmp2<nrqs; tmp2++) {
2512         sbuf1_i = sbuf1[pa[tmp2]];
2513         jmax    = sbuf1_i[0];
2514         ct1     = 2*jmax+1;
2515         ct2     = 0;
2516         rbuf2_i = rbuf2[tmp2];
2517         rbuf3_i = rbuf3[tmp2];
2518         for (j=1; j<=jmax; j++) {
2519           is_no  = sbuf1_i[2*j-1];
2520           max1   = sbuf1_i[2*j];
2521           lens_i = lens[is_no];
2522           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2523           rmap_i = rmap[is_no];
2524           for (k=0; k<max1; k++,ct1++) {
2525 #if defined(PETSC_USE_CTABLE)
2526             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2527             row--;
2528             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2529 #else
2530             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2531 #endif
2532             max2 = rbuf2_i[ct1];
2533             for (l=0; l<max2; l++,ct2++) {
2534               if (!allcolumns[is_no]) {
2535 #if defined(PETSC_USE_CTABLE)
2536                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2537 #else
2538                 tcol = cmap_i[rbuf3_i[ct2]];
2539 #endif
2540                 if (tcol) lens_i[row]++;
2541               } else { /* allcolumns */
2542                 lens_i[row]++; /* lens_i[row] += max2 ? */
2543               }
2544             }
2545           }
2546         }
2547       }
2548     }
2549     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2550     ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2551     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2552 
2553     /* Create the submatrices */
2554     for (i=0; i<ismax; i++) {
2555       PetscInt    rbs,cbs;
2556 
2557       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2558       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2559 
2560       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2561       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2562 
2563       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2564       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2565       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2566 
2567       /* create struct Mat_SubSppt and attached it to submat */
2568       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2569       subc = (Mat_SeqAIJ*)submats[i]->data;
2570       subc->submatis1 = smat_i;
2571 
2572       smat_i->destroy          = submats[i]->ops->destroy;
2573       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2574       submats[i]->factortype   = C->factortype;
2575 
2576       smat_i->id          = i;
2577       smat_i->nrqs        = nrqs;
2578       smat_i->nrqr        = nrqr;
2579       smat_i->rbuf1       = rbuf1;
2580       smat_i->rbuf2       = rbuf2;
2581       smat_i->rbuf3       = rbuf3;
2582       smat_i->sbuf2       = sbuf2;
2583       smat_i->req_source2 = req_source2;
2584 
2585       smat_i->sbuf1       = sbuf1;
2586       smat_i->ptr         = ptr;
2587       smat_i->tmp         = tmp;
2588       smat_i->ctr         = ctr;
2589 
2590       smat_i->pa           = pa;
2591       smat_i->req_size     = req_size;
2592       smat_i->req_source1  = req_source1;
2593 
2594       smat_i->allcolumns  = allcolumns[i];
2595       smat_i->singleis    = PETSC_FALSE;
2596       smat_i->row2proc    = row2proc[i];
2597       smat_i->rmap        = rmap[i];
2598       smat_i->cmap        = cmap[i];
2599     }
2600 
2601     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2602       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2603       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2604       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2605 
2606       /* create struct Mat_SubSppt and attached it to submat */
2607       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2608       submats[0]->data = (void*)smat_i;
2609 
2610       smat_i->destroy          = submats[0]->ops->destroy;
2611       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2612       submats[0]->factortype   = C->factortype;
2613 
2614       smat_i->id          = 0;
2615       smat_i->nrqs        = nrqs;
2616       smat_i->nrqr        = nrqr;
2617       smat_i->rbuf1       = rbuf1;
2618       smat_i->rbuf2       = rbuf2;
2619       smat_i->rbuf3       = rbuf3;
2620       smat_i->sbuf2       = sbuf2;
2621       smat_i->req_source2 = req_source2;
2622 
2623       smat_i->sbuf1       = sbuf1;
2624       smat_i->ptr         = ptr;
2625       smat_i->tmp         = tmp;
2626       smat_i->ctr         = ctr;
2627 
2628       smat_i->pa           = pa;
2629       smat_i->req_size     = req_size;
2630       smat_i->req_source1  = req_source1;
2631 
2632       smat_i->allcolumns  = PETSC_FALSE;
2633       smat_i->singleis    = PETSC_FALSE;
2634       smat_i->row2proc    = NULL;
2635       smat_i->rmap        = NULL;
2636       smat_i->cmap        = NULL;
2637     }
2638 
2639     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2640     ierr = PetscFree(lens);CHKERRQ(ierr);
2641     if (sbuf_aj) {
2642       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2643       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2644     }
2645 
2646   } /* endof scall == MAT_INITIAL_MATRIX */
2647 
2648   /* Post recv matrix values */
2649   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2650   ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr);
2651   ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr);
2652   for (i=0; i<nrqs; ++i) {
2653     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
2654     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2655   }
2656 
2657   /* Allocate sending buffers for a->a, and send them off */
2658   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
2659   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2660   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
2661   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2662 
2663   ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr);
2664   {
2665     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2666     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2667     PetscInt    cend   = C->cmap->rend;
2668     PetscInt    *b_j   = b->j;
2669     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2670 
2671     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2672     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2673     for (i=0; i<nrqr; i++) {
2674       rbuf1_i   = rbuf1[i];
2675       sbuf_aa_i = sbuf_aa[i];
2676       ct1       = 2*rbuf1_i[0]+1;
2677       ct2       = 0;
2678       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2679         kmax = rbuf1_i[2*j];
2680         for (k=0; k<kmax; k++,ct1++) {
2681           row    = rbuf1_i[ct1] - rstart;
2682           nzA    = a_i[row+1] - a_i[row];
2683           nzB    = b_i[row+1] - b_i[row];
2684           ncols  = nzA + nzB;
2685           cworkB = b_j + b_i[row];
2686           vworkA = a_a + a_i[row];
2687           vworkB = b_a + b_i[row];
2688 
2689           /* load the column values for this row into vals*/
2690           vals = sbuf_aa_i+ct2;
2691 
2692           lwrite = 0;
2693           for (l=0; l<nzB; l++) {
2694             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2695           }
2696           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2697           for (l=0; l<nzB; l++) {
2698             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2699           }
2700 
2701           ct2 += ncols;
2702         }
2703       }
2704       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2705     }
2706     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2707     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2708   }
2709 
2710   /* Assemble the matrices */
2711   /* First assemble the local rows */
2712   for (i=0; i<ismax; i++) {
2713     row2proc_i = row2proc[i];
2714     subc      = (Mat_SeqAIJ*)submats[i]->data;
2715     imat_ilen = subc->ilen;
2716     imat_j    = subc->j;
2717     imat_i    = subc->i;
2718     imat_a    = subc->a;
2719 
2720     if (!allcolumns[i]) cmap_i = cmap[i];
2721     rmap_i = rmap[i];
2722     irow_i = irow[i];
2723     jmax   = nrow[i];
2724     for (j=0; j<jmax; j++) {
2725       row  = irow_i[j];
2726       proc = row2proc_i[j];
2727       if (proc == rank) {
2728         old_row = row;
2729 #if defined(PETSC_USE_CTABLE)
2730         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2731         row--;
2732 #else
2733         row = rmap_i[row];
2734 #endif
2735         ilen_row = imat_ilen[row];
2736         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2737         mat_i    = imat_i[row];
2738         mat_a    = imat_a + mat_i;
2739         mat_j    = imat_j + mat_i;
2740         if (!allcolumns[i]) {
2741           for (k=0; k<ncols; k++) {
2742 #if defined(PETSC_USE_CTABLE)
2743             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2744 #else
2745             tcol = cmap_i[cols[k]];
2746 #endif
2747             if (tcol) {
2748               *mat_j++ = tcol - 1;
2749               *mat_a++ = vals[k];
2750               ilen_row++;
2751             }
2752           }
2753         } else { /* allcolumns */
2754           for (k=0; k<ncols; k++) {
2755             *mat_j++ = cols[k];  /* global col index! */
2756             *mat_a++ = vals[k];
2757             ilen_row++;
2758           }
2759         }
2760         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2761 
2762         imat_ilen[row] = ilen_row;
2763       }
2764     }
2765   }
2766 
2767   /* Now assemble the off proc rows */
2768   ierr    = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2769   for (tmp2=0; tmp2<nrqs; tmp2++) {
2770     sbuf1_i = sbuf1[pa[tmp2]];
2771     jmax    = sbuf1_i[0];
2772     ct1     = 2*jmax + 1;
2773     ct2     = 0;
2774     rbuf2_i = rbuf2[tmp2];
2775     rbuf3_i = rbuf3[tmp2];
2776     rbuf4_i = rbuf4[tmp2];
2777     for (j=1; j<=jmax; j++) {
2778       is_no     = sbuf1_i[2*j-1];
2779       rmap_i    = rmap[is_no];
2780       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2781       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2782       imat_ilen = subc->ilen;
2783       imat_j    = subc->j;
2784       imat_i    = subc->i;
2785       imat_a    = subc->a;
2786       max1      = sbuf1_i[2*j];
2787       for (k=0; k<max1; k++,ct1++) {
2788         row = sbuf1_i[ct1];
2789 #if defined(PETSC_USE_CTABLE)
2790         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2791         row--;
2792 #else
2793         row = rmap_i[row];
2794 #endif
2795         ilen  = imat_ilen[row];
2796         mat_i = imat_i[row];
2797         mat_a = imat_a + mat_i;
2798         mat_j = imat_j + mat_i;
2799         max2  = rbuf2_i[ct1];
2800         if (!allcolumns[is_no]) {
2801           for (l=0; l<max2; l++,ct2++) {
2802 #if defined(PETSC_USE_CTABLE)
2803             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2804 #else
2805             tcol = cmap_i[rbuf3_i[ct2]];
2806 #endif
2807             if (tcol) {
2808               *mat_j++ = tcol - 1;
2809               *mat_a++ = rbuf4_i[ct2];
2810               ilen++;
2811             }
2812           }
2813         } else { /* allcolumns */
2814           for (l=0; l<max2; l++,ct2++) {
2815             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2816             *mat_a++ = rbuf4_i[ct2];
2817             ilen++;
2818           }
2819         }
2820         imat_ilen[row] = ilen;
2821       }
2822     }
2823   }
2824 
2825   if (!iscsorted) { /* sort column indices of the rows */
2826     for (i=0; i<ismax; i++) {
2827       subc      = (Mat_SeqAIJ*)submats[i]->data;
2828       imat_j    = subc->j;
2829       imat_i    = subc->i;
2830       imat_a    = subc->a;
2831       imat_ilen = subc->ilen;
2832 
2833       if (allcolumns[i]) continue;
2834       jmax = nrow[i];
2835       for (j=0; j<jmax; j++) {
2836         mat_i = imat_i[j];
2837         mat_a = imat_a + mat_i;
2838         mat_j = imat_j + mat_i;
2839         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2840       }
2841     }
2842   }
2843 
2844   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2845   ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2846   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2847 
2848   /* Restore the indices */
2849   for (i=0; i<ismax; i++) {
2850     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2851     if (!allcolumns[i]) {
2852       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2853     }
2854   }
2855 
2856   for (i=0; i<ismax; i++) {
2857     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2858     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2859   }
2860 
2861   /* Destroy allocated memory */
2862   if (sbuf_aa) {
2863     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2864     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2865   }
2866   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2867 
2868   for (i=0; i<nrqs; ++i) {
2869     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2870   }
2871   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2872 
2873   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2874   PetscFunctionReturn(0);
2875 }
2876 
2877 /*
2878  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2879  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2880  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2881  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2882  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2883  state, and needs to be "assembled" later by compressing B's column space.
2884 
2885  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2886  Following this call, C->A & C->B have been created, even if empty.
2887  */
2888 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2889 {
2890   /* If making this function public, change the error returned in this function away from _PLIB. */
2891   PetscErrorCode ierr;
2892   Mat_MPIAIJ     *aij;
2893   Mat_SeqAIJ     *Baij;
2894   PetscBool      seqaij,Bdisassembled;
2895   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2896   PetscScalar    v;
2897   const PetscInt *rowindices,*colindices;
2898 
2899   PetscFunctionBegin;
2900   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2901   if (A) {
2902     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2903     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2904     if (rowemb) {
2905       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2906       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2907     } else {
2908       if (C->rmap->n != A->rmap->n) {
2909         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2910       }
2911     }
2912     if (dcolemb) {
2913       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2914       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2915     } else {
2916       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2917     }
2918   }
2919   if (B) {
2920     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2921     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2922     if (rowemb) {
2923       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2924       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
2925     } else {
2926       if (C->rmap->n != B->rmap->n) {
2927         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2928       }
2929     }
2930     if (ocolemb) {
2931       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2932       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2933     } else {
2934       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2935     }
2936   }
2937 
2938   aij = (Mat_MPIAIJ*)C->data;
2939   if (!aij->A) {
2940     /* Mimic parts of MatMPIAIJSetPreallocation() */
2941     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2942     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2943     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2944     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2945     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2946   }
2947   if (A) {
2948     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2949   } else {
2950     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2951   }
2952   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2953     /*
2954       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2955       need to "disassemble" B -- convert it to using C's global indices.
2956       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2957 
2958       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2959       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2960 
2961       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2962       At least avoid calling MatSetValues() and the implied searches?
2963     */
2964 
2965     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2966 #if defined(PETSC_USE_CTABLE)
2967       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2968 #else
2969       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2970       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2971       if (aij->B) {
2972         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2973       }
2974 #endif
2975       ngcol = 0;
2976       if (aij->lvec) {
2977         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2978       }
2979       if (aij->garray) {
2980         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2981         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2982       }
2983       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2984       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2985     }
2986     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2987       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2988     }
2989     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2990       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2991     }
2992   }
2993   Bdisassembled = PETSC_FALSE;
2994   if (!aij->B) {
2995     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
2996     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
2997     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
2998     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
2999     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3000     Bdisassembled = PETSC_TRUE;
3001   }
3002   if (B) {
3003     Baij = (Mat_SeqAIJ*)B->data;
3004     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3005       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3006       for (i=0; i<B->rmap->n; i++) {
3007         nz[i] = Baij->i[i+1] - Baij->i[i];
3008       }
3009       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3010       ierr = PetscFree(nz);CHKERRQ(ierr);
3011     }
3012 
3013     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3014     shift = rend-rstart;
3015     count = 0;
3016     rowindices = NULL;
3017     colindices = NULL;
3018     if (rowemb) {
3019       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3020     }
3021     if (ocolemb) {
3022       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3023     }
3024     for (i=0; i<B->rmap->n; i++) {
3025       PetscInt row;
3026       row = i;
3027       if (rowindices) row = rowindices[i];
3028       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3029         col  = Baij->j[count];
3030         if (colindices) col = colindices[col];
3031         if (Bdisassembled && col>=rstart) col += shift;
3032         v    = Baij->a[count];
3033         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3034         ++count;
3035       }
3036     }
3037     /* No assembly for aij->B is necessary. */
3038     /* FIXME: set aij->B's nonzerostate correctly. */
3039   } else {
3040     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3041   }
3042   C->preallocated  = PETSC_TRUE;
3043   C->was_assembled = PETSC_FALSE;
3044   C->assembled     = PETSC_FALSE;
3045    /*
3046       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3047       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3048    */
3049   PetscFunctionReturn(0);
3050 }
3051 
3052 /*
3053   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
3054  */
3055 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3056 {
3057   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3058 
3059   PetscFunctionBegin;
3060   PetscValidPointer(A,2);
3061   PetscValidPointer(B,3);
3062   /* FIXME: make sure C is assembled */
3063   *A = aij->A;
3064   *B = aij->B;
3065   /* Note that we don't incref *A and *B, so be careful! */
3066   PetscFunctionReturn(0);
3067 }
3068 
3069 /*
3070   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3071   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3072 */
3073 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3074                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3075                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3076                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3077                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3078 {
3079   PetscErrorCode ierr;
3080   PetscMPIInt    size,flag;
3081   PetscInt       i,ii,cismax,ispar;
3082   Mat            *A,*B;
3083   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3084 
3085   PetscFunctionBegin;
3086   if (!ismax) PetscFunctionReturn(0);
3087 
3088   for (i = 0, cismax = 0; i < ismax; ++i) {
3089     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3090     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3091     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3092     if (size > 1) ++cismax;
3093   }
3094 
3095   /*
3096      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3097      ispar counts the number of parallel ISs across C's comm.
3098   */
3099   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
3100   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3101     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3102     PetscFunctionReturn(0);
3103   }
3104 
3105   /* if (ispar) */
3106   /*
3107     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3108     These are used to extract the off-diag portion of the resulting parallel matrix.
3109     The row IS for the off-diag portion is the same as for the diag portion,
3110     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3111   */
3112   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3113   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3114   for (i = 0, ii = 0; i < ismax; ++i) {
3115     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3116     if (size > 1) {
3117       /*
3118          TODO: This is the part that's ***NOT SCALABLE***.
3119          To fix this we need to extract just the indices of C's nonzero columns
3120          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3121          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3122          to be done without serializing on the IS list, so, most likely, it is best
3123          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3124       */
3125       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3126       /* Now we have to
3127          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3128              were sorted on each rank, concatenated they might no longer be sorted;
3129          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3130              indices in the nondecreasing order to the original index positions.
3131          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3132       */
3133       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3134       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3135       ++ii;
3136     }
3137   }
3138   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3139   for (i = 0, ii = 0; i < ismax; ++i) {
3140     PetscInt       j,issize;
3141     const PetscInt *indices;
3142 
3143     /*
3144        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3145      */
3146     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3147     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3148     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3149     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3150     for (j = 1; j < issize; ++j) {
3151       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3152     }
3153     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3154     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3155     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3156     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3157     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3158     for (j = 1; j < issize; ++j) {
3159       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3160     }
3161     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3162     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3163     if (size > 1) {
3164       cisrow[ii] = isrow[i];
3165       ++ii;
3166     }
3167   }
3168   /*
3169     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3170     array of sequential matrices underlying the resulting parallel matrices.
3171     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3172     contain duplicates.
3173 
3174     There are as many diag matrices as there are original index sets. There are only as many parallel
3175     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3176 
3177     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3178     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3179       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3180       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3181       with A[i] and B[ii] extracted from the corresponding MPI submat.
3182     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3183       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3184       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3185       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3186       values into A[i] and B[ii] sitting inside the corresponding submat.
3187     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3188       A[i], B[ii], AA[i] or BB[ii] matrices.
3189   */
3190   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3191   if (scall == MAT_INITIAL_MATRIX) {
3192     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3193   }
3194 
3195   /* Now obtain the sequential A and B submatrices separately. */
3196   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3197   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3198   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3199 
3200   /*
3201     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3202     matrices A & B have been extracted directly into the parallel matrices containing them, or
3203     simply into the sequential matrix identical with the corresponding A (if size == 1).
3204     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3205     to have the same sparsity pattern.
3206     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3207     must be constructed for C. This is done by setseqmat(s).
3208   */
3209   for (i = 0, ii = 0; i < ismax; ++i) {
3210     /*
3211        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3212        That way we can avoid sorting and computing permutations when reusing.
3213        To this end:
3214         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3215         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3216           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3217     */
3218     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3219 
3220     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3221     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3222     if (size > 1) {
3223       if (scall == MAT_INITIAL_MATRIX) {
3224         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3225         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3226         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3227         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3228         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3229       }
3230       /*
3231         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3232       */
3233       {
3234         Mat AA = A[i],BB = B[ii];
3235 
3236         if (AA || BB) {
3237           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3238           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3239           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3240         }
3241         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3242       }
3243       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3244       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3245       ++ii;
3246     } else { /* if (size == 1) */
3247       if (scall == MAT_REUSE_MATRIX) {
3248         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3249       }
3250       if (isrow_p[i] || iscol_p[i]) {
3251         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3252         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3253         /* Otherwise A is extracted straight into (*submats)[i]. */
3254         /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3255         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3256       } else (*submat)[i] = A[i];
3257     }
3258     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3259     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3260   }
3261   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3262   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3263   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3264   ierr = PetscFree(A);CHKERRQ(ierr);
3265   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3266   PetscFunctionReturn(0);
3267 }
3268 
3269 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3270 {
3271   PetscErrorCode ierr;
3272 
3273   PetscFunctionBegin;
3274   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3275   PetscFunctionReturn(0);
3276 }
3277