xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision b792a3cda4b57db7765ffeac592e2bedce3f1066)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that are shared across processors.
4 */
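/*
   The MatIncreaseOverlap_MPIAIJ* routines below provide the MATMPIAIJ implementation
   behind MatIncreaseOverlap().  A minimal usage sketch (illustrative only; error
   checking omitted):

       IS is[1];
       ...                                  build is[0] from some locally selected rows
       MatIncreaseOverlap(A, 1, is, 2);     grow each index set by two levels of overlap

   On return each is[i] has been replaced by a larger index set that also contains
   the global column indices coupled to the original rows.
*/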
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
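/*
   MatIncreaseOverlap_MPIAIJ - default driver: each of the ov levels of overlap is
   obtained by one call to MatIncreaseOverlap_MPIAIJ_Once()
*/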
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscErrorCode ierr;
24   PetscInt       i;
25 
26   PetscFunctionBegin;
27   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28   for (i=0; i<ov; ++i) {
29     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
33 
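/*
   MatIncreaseOverlap_MPIAIJ_Scalable - memory-scalable driver: the same loop as above,
   but each level is computed with MatIncreaseOverlap_MPIAIJ_Once_Scalable(), which
   avoids work arrays proportional to the global size of the matrix
*/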
34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35 {
36   PetscErrorCode ierr;
37   PetscInt       i;
38 
39   PetscFunctionBegin;
40   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41   for (i=0; i<ov; ++i) {
42     ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
43   }
44   PetscFunctionReturn(0);
45 }
46 
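/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one level of overlap:
     1) collect, over all index sets, the rows that are owned by other ranks
     2) build the two-sided communication pattern with PetscCommBuildTwoSided() and
        ship the <is id, row> pairs to the owning ranks through a PetscSF
     3) the owners expand each requested row into its column indices
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and send the result back
     4) rows owned locally are expanded in MatIncreaseOverlap_MPIAIJ_Local_Scalable,
        overlapping that computation with the communication of step 3
     5) the returned indices are merged into the index sets in
        MatIncreaseOverlap_MPIAIJ_Receive_Scalable
*/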
47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48 {
49   PetscErrorCode ierr;
50   MPI_Comm       comm;
51   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54   const PetscInt *indices_i,**indices;
55   PetscLayout    rmap;
56   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
57   PetscSF        sf;
58   PetscSFNode    *remote;
59 
60   PetscFunctionBegin;
61   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
62   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
63   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
64   /* get row map to determine where rows should be going */
65   ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
66   /* retrieve the IS data and put it all together so that we
67    * can optimize communication
68    */
69   ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
70   for (i=0,tlength=0; i<nidx; i++) {
71     ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
72     tlength += length[i];
73     ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
74   }
75   /* find these rows on remote processors */
76   ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
77   ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
78   nrrows = 0;
79   for (i=0; i<nidx; i++) {
80     length_i  = length[i];
81     indices_i = indices[i];
82     for (j=0; j<length_i; j++) {
83       owner = -1;
84       ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
85       /* remote processors */
86       if (owner != rank) {
87         tosizes_temp[owner]++; /* number of rows to owner */
88         rrow_ranks[nrrows]  = owner; /* processor */
89         rrow_isids[nrrows]   = i; /* is id */
90         remoterows[nrrows++] = indices_i[j]; /* row */
91       }
92     }
93     ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
94   }
95   ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
96   /* test if we need to exchange messages;
97    * generally speaking, no data needs to be exchanged
98    * when the overlap is 1
99    */
100   ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
101   /* there are no messages to exchange;
102    * this usually corresponds to an overlap of 1
103    */
104   if (!reducednrrows) {
105     ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
106     ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
107     ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
108     PetscFunctionReturn(0);
109   }
110   nto = 0;
111   /* send sizes and ranks for building a two-sided communication */
112   for (i=0; i<size; i++) {
113     if (tosizes_temp[i]) {
114       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
115       tosizes_temp[i] = nto; /* a map from processor to index */
116       toranks[nto++]  = i; /* processor */
117     }
118   }
119   ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
120   toffsets[0] = 0;
121   for (i=0; i<nto; i++) {
122     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
123     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124   }
125   /* send information to other processors */
126   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
127   nrecvrows = 0;
128   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
130   nrecvrows = 0;
131   for (i=0; i<nfrom; i++) {
132     for (j=0; j<fromsizes[2*i]; j++) {
133       remote[nrecvrows].rank    = fromranks[i];
134       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135     }
136   }
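  /* each receive slot is a leaf rooted at position fromsizes[2*i+1]+j of the send
     buffer on rank fromranks[i]; the PetscSFBcast below therefore moves todata
     (roots) into fromdata (leaves) */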
137   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
138   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
139   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
140   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
141   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
142   /* each message is a sequence of pairs <is id, row> */
143   ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
144   for (i=0; i<nrrows; i++) {
145     owner = rrow_ranks[i]; /* processor */
146     j     = tosizes_temp[owner]; /* index */
147     todata[toffsets[j]++] = rrow_isids[i];
148     todata[toffsets[j]++] = remoterows[i];
149   }
150   ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
151   ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
152   ierr = PetscFree(toffsets);CHKERRQ(ierr);
153   ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
154   ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
155   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
156   /* send the rows that belong to remote processors so that we can get the overlapping data back */
157   ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
158   ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
159   ierr = PetscFree(fromsizes);CHKERRQ(ierr);
160   ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
161   ierr = PetscFree(fromranks);CHKERRQ(ierr);
162   nrecvrows = 0;
163   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164   ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
165   ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
166   nrecvrows = 0;
167   for (i=0; i<nto; i++) {
168     for (j=0; j<tosizes[2*i]; j++) {
169       remote[nrecvrows].rank    = toranks[i];
170       remote[nrecvrows++].index = tosizes[2*i+1]+j;
171     }
172   }
173   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
174   ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
175   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176   ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
177   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
178   /* overlap communication and computation */
179   ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
180   ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
181   ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
182   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
183   ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
184   ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
185   ierr = PetscFree(toranks);CHKERRQ(ierr);
186   ierr = PetscFree(tosizes);CHKERRQ(ierr);
187   ierr = PetscFree(todata);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
190 
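/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - unpack the replies: recvdata is a
   sequence of triples <is id, count, count indices>; the indices are appended to
   the matching index set, which is then sorted, deduplicated, and recreated on its
   original communicator
*/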
191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192 {
193   PetscInt         *isz,isz_i,i,j,is_id, data_size;
194   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
195   const PetscInt   *indices_i_temp;
196   MPI_Comm         *iscomms;
197   PetscErrorCode    ierr;
198 
199   PetscFunctionBegin;
200   max_lsize = 0;
201   ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
202   for (i=0; i<nidx; i++) {
203     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
204     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205     isz[i]    = lsize;
206   }
207   ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
208   for (i=0; i<nidx; i++) {
209     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
210     ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
211     ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
212     ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
213     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
214   }
215   /* retrieve information to get row id and its overlap */
216   for (i=0; i<nrecvs;) {
217     is_id     = recvdata[i++];
218     data_size = recvdata[i++];
219     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220     isz_i     = isz[is_id];
221     for (j=0; j< data_size; j++) {
222       col = recvdata[i++];
223       indices_i[isz_i++] = col;
224     }
225     isz[is_id] = isz_i;
226   }
227   /* remove duplicate entries */
228   for (i=0; i<nidx; i++) {
229     indices_i = indices_temp+(max_lsize+nrecvs)*i;
230     isz_i     = isz[i];
231     ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
232     ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
233     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
234   }
235   ierr = PetscFree(isz);CHKERRQ(ierr);
236   ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
237   PetscFunctionReturn(0);
238 }
239 
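/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for every requesting rank, expand the rows
   it asked for into their column indices (diagonal block Amat plus off-diagonal block
   Bmat mapped through gcols) and pack the reply as triples <is id, count, indices>;
   a first pass only counts so the reply buffer is allocated once, a second pass fills it
*/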
240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241 {
242   PetscLayout       rmap,cmap;
243   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
247   Mat               amat,bmat;
248   PetscMPIInt       rank;
249   PetscBool         done;
250   MPI_Comm          comm;
251   PetscErrorCode    ierr;
252 
253   PetscFunctionBegin;
254   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
255   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
256   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
257   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
258   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
259   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
261   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262   /* total number of nonzero values is used to estimate the memory usage in the next step */
263   tnz  = ai[an]+bi[bn];
264   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
265   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
266   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
267   /* to find the longest message */
268   max_fszs = 0;
269   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270   /* is there a better way to estimate the number of nonzeros in the matrix? */
271   ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
272   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273   rows_pos  = 0;
274   totalrows = 0;
275   for (i=0; i<nfrom; i++) {
276     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
277     /* group data together */
278     for (j=0; j<fromsizes[2*i]; j+=2) {
279       is_id                       = fromrows[rows_pos++];/* no of is */
280       rows_i                      = rows_data[is_id];
281       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282     }
283     /* estimate the space needed to avoid multiple allocations */
284     for (j=0; j<nidx; j++) {
285       indvc_ij = 0;
286       rows_i   = rows_data[j];
287       for (l=0; l<rows_pos_i[j]; l++) {
288         row    = rows_i[l]-rstart;
289         start  = ai[row];
290         end    = ai[row+1];
291         for (k=start; k<end; k++) { /* Amat */
292           col = aj[k] + cstart;
293           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294         }
295         start = bi[row];
296         end   = bi[row+1];
297         for (k=start; k<end; k++) { /* Bmat */
298           col = gcols[bj[k]];
299           indices_tmp[indvc_ij++] = col;
300         }
301       }
302       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
303       indv_counts[i*nidx+j] = indvc_ij;
304       totalrows            += indvc_ij;
305     }
306   }
307   /* message triple <is id, number of indices, indices> */
308   ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
309   totalrows = 0;
310   rows_pos  = 0;
311   /* repeat the grouping pass above, this time filling the send buffer */
312   for (i=0;i<nfrom;i++) {
313     ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
314     for (j=0; j<fromsizes[2*i]; j+=2) {
315       is_id                       = fromrows[rows_pos++];
316       rows_i                      = rows_data[is_id];
317       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318     }
319     /* add data  */
320     for (j=0; j<nidx; j++) {
321       if (!indv_counts[i*nidx+j]) continue;
322       indvc_ij = 0;
323       sbdata[totalrows++] = j;
324       sbdata[totalrows++] = indv_counts[i*nidx+j];
325       sbsizes[2*i]       += 2;
326       rows_i              = rows_data[j];
327       for (l=0; l<rows_pos_i[j]; l++) {
328         row   = rows_i[l]-rstart;
329         start = ai[row];
330         end   = ai[row+1];
331         for (k=start; k<end; k++) { /* Amat */
332           col = aj[k] + cstart;
333           indices_tmp[indvc_ij++] = col;
334         }
335         start = bi[row];
336         end   = bi[row+1];
337         for (k=start; k<end; k++) { /* Bmat */
338           col = gcols[bj[k]];
339           indices_tmp[indvc_ij++] = col;
340         }
341       }
342       ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
343       sbsizes[2*i]  += indvc_ij;
344       ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
345       totalrows += indvc_ij;
346     }
347   }
348   ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
349   offsets[0] = 0;
350   for (i=0; i<nfrom; i++) {
351     offsets[i+1]   = offsets[i] + sbsizes[2*i];
352     sbsizes[2*i+1] = offsets[i];
353   }
354   ierr = PetscFree(offsets);CHKERRQ(ierr);
355   if (sbrowsizes) *sbrowsizes = sbsizes;
356   if (sbrows) *sbrows = sbdata;
357   ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
358   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
359   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
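/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - handle the locally owned part: for every
   index-set entry owned by this rank, add the column indices of that row (from both
   the diagonal and the off-diagonal block), then sort and remove duplicates before
   rebuilding the IS on its original communicator
*/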
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364 {
365   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
366   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367   PetscInt          lsize,lsize_tmp;
368   PetscMPIInt       rank,owner;
369   Mat               amat,bmat;
370   PetscBool         done;
371   PetscLayout       cmap,rmap;
372   MPI_Comm          comm;
373   PetscErrorCode    ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
377   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
378   ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
379   ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
380   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381   ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
382   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383   /* is this a safe way to compute the number of nonzero values? */
384   tnz  = ai[an]+bi[bn];
385   ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
386   ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
387   ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
388   /* this estimates memory better than the old implementation,
389    * which used the global size of the matrix
390    */
391   ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
392   for (i=0; i<nidx; i++) {
393     MPI_Comm iscomm;
394 
395     ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
396     ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
397     lsize_tmp = 0;
398     for (j=0; j<lsize; j++) {
399       owner = -1;
400       row   = indices[j];
401       ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
402       if (owner != rank) continue;
403       /* local number */
404       row  -= rstart;
405       start = ai[row];
406       end   = ai[row+1];
407       for (k=start; k<end; k++) { /* Amat */
408         col = aj[k] + cstart;
409         indices_temp[lsize_tmp++] = col;
410       }
411       start = bi[row];
412       end   = bi[row+1];
413       for (k=start; k<end; k++) { /* Bmat */
414         col = gcols[bj[k]];
415         indices_temp[lsize_tmp++] = col;
416       }
417     }
418     ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
419     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
420     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
421     ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
422     ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
423     ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
424   }
425   ierr = PetscFree(indices_temp);CHKERRQ(ierr);
426   ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
427   ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 /*
432   Sample message format:
433   If processor A wants processor B to process some elements corresponding
434   to index sets is[1] and is[5], the message looks like:
435   mesg [0] = 2   (no of index sets in the mesg)
436   -----------
437   mesg [1] = 1  => is[1]
438   mesg [2] = size of is[1]
439   -----------
440   mesg [3] = 5  => is[5]
441   mesg [4] = size of is[5]
442   -----------
443   mesg [5]
444   mesg [n]   data for is[1]
445   -----------
446   mesg [n+1]
447   mesg [m]   data for is[5]
448   -----------
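  A concrete (illustrative) instance: with is[1] = {3,7} and is[5] = {12} the whole
  message would be  2, 1, 2, 5, 1, 3, 7, 12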
449 
450   Notes:
451   nrqs - no of requests sent (or to be sent out)
452   nrqr - no of requests received (which have to be or which have been processed)
453 */
454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455 {
456   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
457   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458   const PetscInt **idx,*idx_i;
459   PetscInt       *n,**data,len;
460 #if defined(PETSC_USE_CTABLE)
461   PetscTable     *table_data,table_data_i;
462   PetscInt       *tdata,tcount,tcount_max;
463 #else
464   PetscInt       *data_i,*d_p;
465 #endif
466   PetscErrorCode ierr;
467   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
468   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470   PetscBT        *table;
471   MPI_Comm       comm;
472   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473   MPI_Status     *recv_status;
474   MPI_Comm       *iscomms;
475   char           *t_p;
476 
477   PetscFunctionBegin;
478   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
479   size = c->size;
480   rank = c->rank;
481   M    = C->rmap->N;
482 
483   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
484   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
485 
486   ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
487 
488   for (i=0; i<imax; i++) {
489     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
490     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
491   }
492 
493   /* evaluate communication - mesg to whom, length of mesg, and buffer space
494      required. Based on this, buffers are allocated, and data copied into them  */
495   ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
496   for (i=0; i<imax; i++) {
497     ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
498     idx_i = idx[i];
499     len   = n[i];
500     for (j=0; j<len; j++) {
501       row = idx_i[j];
502       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
504       w4[proc]++;
505     }
506     for (j=0; j<size; j++) {
507       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508     }
509   }
510 
511   nrqs     = 0;              /* no of outgoing messages */
512   msz      = 0;              /* total mesg length (for all procs) */
513   w1[rank] = 0;              /* no mesg sent to itself */
514   w3[rank] = 0;
515   for (i=0; i<size; i++) {
516     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517   }
518   /* pa - is list of processors to communicate with */
519   ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
520   for (i=0,j=0; i<size; i++) {
521     if (w1[i]) {pa[j] = i; j++;}
522   }
523 
524   /* Each message would have a header = 1 + 2*(no of IS) + data */
525   for (i=0; i<nrqs; i++) {
526     j      = pa[i];
527     w1[j] += w2[j] + 2*w3[j];
528     msz   += w1[j];
529   }
530 
531   /* Determine the number of messages to expect, their lengths, and their from-ids */
532   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
533   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
534 
535   /* Now post the Irecvs corresponding to these messages */
536   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
537 
538   /* Allocate Memory for outgoing messages */
539   ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
540   ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
541   ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
542 
543   {
544     PetscInt *iptr = tmp,ict  = 0;
545     for (i=0; i<nrqs; i++) {
546       j         = pa[i];
547       iptr     +=  ict;
548       outdat[j] = iptr;
549       ict       = w1[j];
550     }
551   }
552 
553   /* Form the outgoing messages */
554   /* plug in the headers */
555   for (i=0; i<nrqs; i++) {
556     j            = pa[i];
557     outdat[j][0] = 0;
558     ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
559     ptr[j]       = outdat[j] + 2*w3[j] + 1;
560   }
561 
562   /* Memory for doing local proc's work */
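  /* table[i] is a bit table over the global indices marking what is already in is[i];
     the indices themselves are stored either in the hash table table_data[i]
     (PETSC_USE_CTABLE) or in the dense array data[i] */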
563   {
564     PetscInt M_BPB_imax = 0;
565 #if defined(PETSC_USE_CTABLE)
566     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
567     ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
568     for (i=0; i<imax; i++) {
569       ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
570     }
571     ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
572     for (i=0; i<imax; i++) {
573       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574     }
575 #else
576     PetscInt Mimax = 0;
577     ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
578     ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
579     ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
580     for (i=0; i<imax; i++) {
581       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582       data[i]  = d_p + M*i;
583     }
584 #endif
585   }
586 
587   /* Parse the IS and update local tables and the outgoing buf with the data */
588   {
589     PetscInt n_i,isz_i,*outdat_j,ctr_j;
590     PetscBT  table_i;
591 
592     for (i=0; i<imax; i++) {
593       ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
594       n_i     = n[i];
595       table_i = table[i];
596       idx_i   = idx[i];
597 #if defined(PETSC_USE_CTABLE)
598       table_data_i = table_data[i];
599 #else
600       data_i  = data[i];
601 #endif
602       isz_i   = isz[i];
603       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
604         row  = idx_i[j];
605         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
606         if (proc != rank) { /* copy to the outgoing buffer */
607           ctr[proc]++;
608           *ptr[proc] = row;
609           ptr[proc]++;
610         } else if (!PetscBTLookupSet(table_i,row)) {
611 #if defined(PETSC_USE_CTABLE)
612           ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
613 #else
614           data_i[isz_i] = row; /* Update the local table */
615 #endif
616           isz_i++;
617         }
618       }
619       /* Update the headers for the current IS */
620       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621         if ((ctr_j = ctr[j])) {
622           outdat_j        = outdat[j];
623           k               = ++outdat_j[0];
624           outdat_j[2*k]   = ctr_j;
625           outdat_j[2*k-1] = i;
626         }
627       }
628       isz[i] = isz_i;
629     }
630   }
631 
632   /*  Now  post the sends */
633   ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
634   for (i=0; i<nrqs; ++i) {
635     j    = pa[i];
636     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
637   }
638 
639   /* No longer need the original indices */
640   ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
641   for (i=0; i<imax; ++i) {
642     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
643     ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
644   }
645   ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
646 
647   for (i=0; i<imax; ++i) {
648     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
649   }
650 
651   /* Do Local work */
652 #if defined(PETSC_USE_CTABLE)
653   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
654 #else
655   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
656 #endif
657 
658   /* Receive messages */
659   ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
660   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
661   ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
662 
663   /* Phase 1 sends are complete - deallocate buffers */
664   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
665   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
666 
667   ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
668   ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
669   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
670   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
671   ierr = PetscFree(rbuf);CHKERRQ(ierr);
672 
673   /* Send the data back */
674   /* Do a global reduction to know the buffer space req for incoming messages */
675   {
676     PetscMPIInt *rw1;
677 
678     ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
679 
680     for (i=0; i<nrqr; ++i) {
681       proc = recv_status[i].MPI_SOURCE;
682 
683       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
684       rw1[proc] = isz1[i];
685     }
686     ierr = PetscFree(onodes1);CHKERRQ(ierr);
687     ierr = PetscFree(olengths1);CHKERRQ(ierr);
688 
689     /* Determine the number of messages to expect, their lengths, and their from-ids */
690     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
691     ierr = PetscFree(rw1);CHKERRQ(ierr);
692   }
693   /* Now post the Irecvs corresponding to these messages */
694   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
695 
696   /* Now  post the sends */
697   ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
698   for (i=0; i<nrqr; ++i) {
699     j    = recv_status[i].MPI_SOURCE;
700     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
701   }
702 
703   /* receive work done on other processors */
704   {
705     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
706     PetscMPIInt idex;
707     PetscBT     table_i;
708 
709     for (i=0; i<nrqs; ++i) {
710       ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
711       /* Process the message */
712       rbuf2_i = rbuf2[idex];
713       ct1     = 2*rbuf2_i[0]+1;
714       jmax    = rbuf2[idex][0];
715       for (j=1; j<=jmax; j++) {
716         max     = rbuf2_i[2*j];
717         is_no   = rbuf2_i[2*j-1];
718         isz_i   = isz[is_no];
719         table_i = table[is_no];
720 #if defined(PETSC_USE_CTABLE)
721         table_data_i = table_data[is_no];
722 #else
723         data_i  = data[is_no];
724 #endif
725         for (k=0; k<max; k++,ct1++) {
726           row = rbuf2_i[ct1];
727           if (!PetscBTLookupSet(table_i,row)) {
728 #if defined(PETSC_USE_CTABLE)
729             ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
730 #else
731             data_i[isz_i] = row;
732 #endif
733             isz_i++;
734           }
735         }
736         isz[is_no] = isz_i;
737       }
738     }
739 
740     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
741   }
742 
743 #if defined(PETSC_USE_CTABLE)
744   tcount_max = 0;
745   for (i=0; i<imax; ++i) {
746     table_data_i = table_data[i];
747     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
748     if (tcount_max < tcount) tcount_max = tcount;
749   }
750   ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
751 #endif
752 
753   for (i=0; i<imax; ++i) {
754 #if defined(PETSC_USE_CTABLE)
755     PetscTablePosition tpos;
756     table_data_i = table_data[i];
757 
758     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
759     while (tpos) {
760       ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
761       tdata[--j] = --k;
762     }
763     ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
764 #else
765     ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
766 #endif
767     ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
768   }
769 
770   ierr = PetscFree(iscomms);CHKERRQ(ierr);
771   ierr = PetscFree(onodes2);CHKERRQ(ierr);
772   ierr = PetscFree(olengths2);CHKERRQ(ierr);
773 
774   ierr = PetscFree(pa);CHKERRQ(ierr);
775   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
776   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
777   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
778   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
779   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
780   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
781   ierr = PetscFree(recv_status);CHKERRQ(ierr);
782   if (xdata) {
783     ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
784     ierr = PetscFree(xdata);CHKERRQ(ierr);
785   }
786   ierr = PetscFree(isz1);CHKERRQ(ierr);
787 #if defined(PETSC_USE_CTABLE)
788   for (i=0; i<imax; i++) {
789     ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
790   }
791   ierr = PetscFree(table_data);CHKERRQ(ierr);
792   ierr = PetscFree(tdata);CHKERRQ(ierr);
793   ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
794 #else
795   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
796 #endif
797   PetscFunctionReturn(0);
798 }
799 
800 /*
801    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once(), to do
802        the work on the local processor.
803 
804      Inputs:
805       C      - MAT_MPIAIJ;
806       imax - total no of index sets processed at a time;
807       table  - an array of char - size = m bits.
808 
809      Output:
810       isz    - array containing the count of the solution elements corresponding
811                to each index set;
812       data or table_data  - pointer to the solutions
813 */
814 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
815 {
816   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817   Mat        A  = c->A,B = c->B;
818   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
820   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
821   PetscBT    table_i;
822 #if defined(PETSC_USE_CTABLE)
823   PetscTable         table_data_i;
824   PetscErrorCode     ierr;
825   PetscTablePosition tpos;
826   PetscInt           tcount,*tdata;
827 #else
828   PetscInt           *data_i;
829 #endif
830 
831   PetscFunctionBegin;
832   rstart = C->rmap->rstart;
833   cstart = C->cmap->rstart;
834   ai     = a->i;
835   aj     = a->j;
836   bi     = b->i;
837   bj     = b->j;
838   garray = c->garray;
839 
840   for (i=0; i<imax; i++) {
841 #if defined(PETSC_USE_CTABLE)
842     /* copy existing entries of table_data_i into tdata[] */
843     table_data_i = table_data[i];
844     ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
845     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"tcount %D != isz[%D] %D",tcount,i,isz[i]);
846 
847     ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
848     ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
849     while (tpos) {
850       ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
851       tdata[--j] = --row;
852       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"j %D >= tcount %D",j,tcount);
853     }
854 #else
855     data_i  = data[i];
856 #endif
857     table_i = table[i];
858     isz_i   = isz[i];
859     max     = isz[i];
860 
861     for (j=0; j<max; j++) {
862 #if defined(PETSC_USE_CTABLE)
863       row   = tdata[j] - rstart;
864 #else
865       row   = data_i[j] - rstart;
866 #endif
867       start = ai[row];
868       end   = ai[row+1];
869       for (k=start; k<end; k++) { /* Amat */
870         val = aj[k] + cstart;
871         if (!PetscBTLookupSet(table_i,val)) {
872 #if defined(PETSC_USE_CTABLE)
873           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
874 #else
875           data_i[isz_i] = val;
876 #endif
877           isz_i++;
878         }
879       }
880       start = bi[row];
881       end   = bi[row+1];
882       for (k=start; k<end; k++) { /* Bmat */
883         val = garray[bj[k]];
884         if (!PetscBTLookupSet(table_i,val)) {
885 #if defined(PETSC_USE_CTABLE)
886           ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
887 #else
888           data_i[isz_i] = val;
889 #endif
890           isz_i++;
891         }
892       }
893     }
894     isz[i] = isz_i;
895 
896 #if defined(PETSC_USE_CTABLE)
897     ierr = PetscFree(tdata);CHKERRQ(ierr);
898 #endif
899   }
900   PetscFunctionReturn(0);
901 }
902 
903 /*
904       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905          and return the output
906 
907          Input:
908            C    - the matrix
909            nrqr - no of messages being processed.
910            rbuf - an array of pointers to the received requests
911 
912          Output:
913            xdata - array of messages to be sent back
914            isz1  - size of each message
915 
916   For better efficiency perhaps we should malloc each xdata[i] separately;
917 then if a re-malloc is required we would only need to copy the data for that
918 one row rather than all previous rows, as is now the case where a single
919 large chunk of memory is used.
920 
921 */
922 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923 {
924   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
925   Mat            A  = c->A,B = c->B;
926   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
927   PetscErrorCode ierr;
928   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931   PetscInt       *rbuf_i,kmax,rbuf_0;
932   PetscBT        xtable;
933 
934   PetscFunctionBegin;
935   m      = C->rmap->N;
936   rstart = C->rmap->rstart;
937   cstart = C->cmap->rstart;
938   ai     = a->i;
939   aj     = a->j;
940   bi     = b->i;
941   bj     = b->j;
942   garray = c->garray;
943 
944   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945     rbuf_i =  rbuf[i];
946     rbuf_0 =  rbuf_i[0];
947     ct    += rbuf_0;
948     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949   }
950 
951   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952   else max1 = 1;
953   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954   if (nrqr) {
955     ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
956     ++no_malloc;
957   }
958   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
959   ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
960 
961   ct3 = 0;
962   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
963     rbuf_i =  rbuf[i];
964     rbuf_0 =  rbuf_i[0];
965     ct1    =  2*rbuf_0+1;
966     ct2    =  ct1;
967     ct3   += ct1;
968     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
970       oct2 = ct2;
971       kmax = rbuf_i[2*j];
972       for (k=0; k<kmax; k++,ct1++) {
973         row = rbuf_i[ct1];
974         if (!PetscBTLookupSet(xtable,row)) {
975           if (!(ct3 < mem_estimate)) {
976             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
978             ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
979             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
980             xdata[0]     = tmp;
981             mem_estimate = new_estimate; ++no_malloc;
982             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983           }
984           xdata[i][ct2++] = row;
985           ct3++;
986         }
987       }
988       for (k=oct2,max2=ct2; k<max2; k++) {
989         row   = xdata[i][k] - rstart;
990         start = ai[row];
991         end   = ai[row+1];
992         for (l=start; l<end; l++) {
993           val = aj[l] + cstart;
994           if (!PetscBTLookupSet(xtable,val)) {
995             if (!(ct3 < mem_estimate)) {
996               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
998               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
999               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1000               xdata[0]     = tmp;
1001               mem_estimate = new_estimate; ++no_malloc;
1002               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003             }
1004             xdata[i][ct2++] = val;
1005             ct3++;
1006           }
1007         }
1008         start = bi[row];
1009         end   = bi[row+1];
1010         for (l=start; l<end; l++) {
1011           val = garray[bj[l]];
1012           if (!PetscBTLookupSet(xtable,val)) {
1013             if (!(ct3 < mem_estimate)) {
1014               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
1016               ierr         = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
1017               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
1018               xdata[0]     = tmp;
1019               mem_estimate = new_estimate; ++no_malloc;
1020               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021             }
1022             xdata[i][ct2++] = val;
1023             ct3++;
1024           }
1025         }
1026       }
1027       /* Update the header*/
1028       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029       xdata[i][2*j-1] = rbuf_i[2*j-1];
1030     }
1031     xdata[i][0] = rbuf_0;
1032     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1033     isz1[i]     = ct2; /* size of each message */
1034   }
1035   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
1036   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 /* -------------------------------------------------------------------------*/
1040 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041 /*
1042     Every processor gets the entire matrix
1043 */
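/*
   For MAT_INITIAL_MATRIX the structure is built with two MPI_Allgatherv() rounds
   (nonzeros per row, then global column indices); when flag is MAT_GET_VALUES a
   further MPI_Allgatherv() gathers the numerical values, so every rank ends up with
   an identical sequential copy of the matrix.
*/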
1044 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1045 {
1046   Mat            B;
1047   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1048   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1049   PetscErrorCode ierr;
1050   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1051   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1052   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1053 
1054   PetscFunctionBegin;
1055   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1056   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
1057   if (scall == MAT_INITIAL_MATRIX) {
1058     /* ----------------------------------------------------------------
1059          Tell every processor the number of nonzeros per row
1060     */
1061     ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
1062     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1063       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1064     }
1065     ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1066     for (i=0; i<size; i++) {
1067       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1068       displs[i]     = A->rmap->range[i];
1069     }
1070 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1071     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1072 #else
1073     sendcount = A->rmap->rend - A->rmap->rstart;
1074     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1075 #endif
1076     /* ---------------------------------------------------------------
1077          Create the sequential matrix of the same type as the local block diagonal
1078     */
1079     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
1080     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1081     ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
1082     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
1083     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
1084     ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
1085     **Bin = B;
1086     b     = (Mat_SeqAIJ*)B->data;
1087 
1088     /*--------------------------------------------------------------------
1089        Copy my part of matrix column indices over
1090     */
1091     sendcount  = ad->nz + bd->nz;
1092     jsendbuf   = b->j + b->i[rstarts[rank]];
1093     a_jsendbuf = ad->j;
1094     b_jsendbuf = bd->j;
1095     n          = A->rmap->rend - A->rmap->rstart;
1096     cnt        = 0;
1097     for (i=0; i<n; i++) {
1098       /* put in lower diagonal portion */
1099       m = bd->i[i+1] - bd->i[i];
1100       while (m > 0) {
1101         /* is it above diagonal (in bd (compressed) numbering) */
1102         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104         m--;
1105       }
1106 
1107       /* put in diagonal portion */
1108       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110       }
1111 
1112       /* put in upper diagonal portion */
1113       while (m-- > 0) {
1114         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115       }
1116     }
1117     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118 
1119     /*--------------------------------------------------------------------
1120        Gather all column indices to all processors
1121     */
1122     for (i=0; i<size; i++) {
1123       recvcounts[i] = 0;
1124       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125         recvcounts[i] += lens[j];
1126       }
1127     }
1128     displs[0] = 0;
1129     for (i=1; i<size; i++) {
1130       displs[i] = displs[i-1] + recvcounts[i-1];
1131     }
1132 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1134 #else
1135     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1136 #endif
1137     /*--------------------------------------------------------------------
1138         Assemble the matrix into usable form (note numerical values not yet set)
1139     */
1140     /* set the b->ilen (length of each row) values */
1141     ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
1142     /* set the b->i indices */
1143     b->i[0] = 0;
1144     for (i=1; i<=A->rmap->N; i++) {
1145       b->i[i] = b->i[i-1] + lens[i-1];
1146     }
1147     ierr = PetscFree(lens);CHKERRQ(ierr);
1148     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150 
1151   } else {
1152     B = **Bin;
1153     b = (Mat_SeqAIJ*)B->data;
1154   }
1155 
1156   /*--------------------------------------------------------------------
1157        Copy my part of matrix numerical values into the values location
1158   */
1159   if (flag == MAT_GET_VALUES) {
1160     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1161     MatScalar         *sendbuf,*recvbuf;
1162 
1163     ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
1164     ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
1165     sendcount = ad->nz + bd->nz;
1166     sendbuf   = b->a + b->i[rstarts[rank]];
1167     a_sendbuf = ada;
1168     b_sendbuf = bda;
1169     b_sendj   = bd->j;
1170     n         = A->rmap->rend - A->rmap->rstart;
1171     cnt       = 0;
1172     for (i=0; i<n; i++) {
1173       /* put in lower diagonal portion */
1174       m = bd->i[i+1] - bd->i[i];
1175       while (m > 0) {
1176         /* is it above diagonal (in bd (compressed) numbering) */
1177         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1178         sendbuf[cnt++] = *b_sendbuf++;
1179         m--;
1180         b_sendj++;
1181       }
1182 
1183       /* put in diagonal portion */
1184       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1185         sendbuf[cnt++] = *a_sendbuf++;
1186       }
1187 
1188       /* put in upper diagonal portion */
1189       while (m-- > 0) {
1190         sendbuf[cnt++] = *b_sendbuf++;
1191         b_sendj++;
1192       }
1193     }
1194     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1195 
1196     /* -----------------------------------------------------------------
1197        Gather all numerical values to all processors
1198     */
1199     if (!recvcounts) {
1200       ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
1201     }
1202     for (i=0; i<size; i++) {
1203       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1204     }
1205     displs[0] = 0;
1206     for (i=1; i<size; i++) {
1207       displs[i] = displs[i-1] + recvcounts[i-1];
1208     }
1209     recvbuf = b->a;
1210 #if defined(PETSC_HAVE_MPI_IN_PLACE)
1211     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1212 #else
1213     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
1214 #endif
1215     ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
1216     ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
1217   }  /* end of (flag == MAT_GET_VALUES) */
1218   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
1219 
1220   if (A->symmetric) {
1221     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1222   } else if (A->hermitian) {
1223     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
1224   } else if (A->structurally_symmetric) {
1225     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
1226   }
1227   PetscFunctionReturn(0);
1228 }
1229 
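/*
   MatCreateSubMatrices_MPIAIJ_SingleIS_Local - extraction path used when every process
   requests exactly one submatrix (ismax == 1).  Rows owned by other ranks are obtained
   with a request/reply protocol: the requested row numbers are sent to their owners,
   which reply with the size of each requested row and, in later messages, the column
   indices and numerical values needed to assemble the local submatrix.
*/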
1230 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1231 {
1232   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1233   Mat            submat,A = c->A,B = c->B;
1234   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1235   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1236   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1237   const PetscInt *icol,*irow;
1238   PetscInt       nrow,ncol,start;
1239   PetscErrorCode ierr;
1240   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1241   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1242   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1243   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1244   PetscInt       *lens,rmax,ncols,*cols,Crow;
1245 #if defined(PETSC_USE_CTABLE)
1246   PetscTable     cmap,rmap;
1247   PetscInt       *cmap_loc,*rmap_loc;
1248 #else
1249   PetscInt       *cmap,*rmap;
1250 #endif
1251   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1252   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1253   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1254   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1255   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1256   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1257   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1258   MPI_Comm       comm;
1259   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1260   PetscMPIInt    *onodes1,*olengths1,idex,end;
1261   Mat_SubSppt    *smatis1;
1262   PetscBool      isrowsorted,iscolsorted;
1263 
1264   PetscFunctionBegin;
1265   PetscValidLogicalCollectiveInt(C,ismax,2);
1266   PetscValidLogicalCollectiveEnum(C,scall,5);
1267   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268   ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1269   ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1270   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1271   size = c->size;
1272   rank = c->rank;
1273 
1274   ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
1275   ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
1276   ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
1277   ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
1278   if (allcolumns) {
1279     icol = NULL;
1280     ncol = C->cmap->N;
1281   } else {
1282     ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
1283     ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1284   }
1285 
1286   if (scall == MAT_INITIAL_MATRIX) {
1287     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1288 
1289     /* Get some new tags to keep the communication clean */
1290     tag1 = ((PetscObject)C)->tag;
1291     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
1292     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
1293 
1294     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1295      required. Based on this, buffers are allocated, and data copied into them */
1296     ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
1297     ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);
1298 
1299     /* w1[proc] = num of rows owned by proc -- to be requested */
1300     proc = 0;
1301     nrqs = 0; /* num of outgoing messages */
1302     for (j=0; j<nrow; j++) {
1303       row  = irow[j];
1304       if (!isrowsorted) proc = 0;
1305       while (row >= C->rmap->range[proc+1]) proc++;
1306       w1[proc]++;
1307       row2proc[j] = proc; /* map row index to proc */
1308 
1309       if (proc != rank && !w2[proc]) {
1310         w2[proc] = 1; nrqs++;
1311       }
1312     }
1313     w1[rank] = 0;  /* rows owned by self will not be requested */
1314 
1315     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
1316     for (proc=0,j=0; proc<size; proc++) {
1317       if (w1[proc]) { pa[j++] = proc;}
1318     }
1319 
1320     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321     msz = 0;              /* total mesg length (for all procs) */
1322     for (i=0; i<nrqs; i++) {
1323       proc      = pa[i];
1324       w1[proc] += 3;
1325       msz      += w1[proc];
1326     }
1327     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
1328 
1329     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1330     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
1332 
1333     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
1336 
1337     /* Now post the Irecvs corresponding to these messages */
1338     ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
1339 
1340     ierr = PetscFree(onodes1);CHKERRQ(ierr);
1341     ierr = PetscFree(olengths1);CHKERRQ(ierr);
1342 
1343     /* Allocate Memory for outgoing messages */
1344     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
1345     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
1346     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
1347 
1348     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1349     iptr = tmp;
1350     for (i=0; i<nrqs; i++) {
1351       proc        = pa[i];
1352       sbuf1[proc] = iptr;
1353       iptr       += w1[proc];
1354     }
1355 
1356     /* Form the outgoing messages */
1357     /* Initialize the header space */
1358     for (i=0; i<nrqs; i++) {
1359       proc      = pa[i];
1360       ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
1361       ptr[proc] = sbuf1[proc] + 3;
1362     }
1363 
1364     /* Parse the isrow and copy data into outbuf */
1365     ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
1366     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1367       proc = row2proc[j];
1368       if (proc != rank) { /* copy to the outgoing buf*/
1369         *ptr[proc] = irow[j];
1370         ctr[proc]++; ptr[proc]++;
1371       }
1372     }
1373 
1374     /* Update the headers for the current IS */
1375     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376       if ((ctr_j = ctr[j])) {
1377         sbuf1_j        = sbuf1[j];
1378         k              = ++sbuf1_j[0];
1379         sbuf1_j[2*k]   = ctr_j;
1380         sbuf1_j[2*k-1] = 0;
1381       }
1382     }
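    /* At this point each outgoing message has the single-IS layout
       [nIS(=1), is_no(=0), nrows, row indices...]: sbuf1[proc][0] holds the number of IS
       blocks, followed by one (is_no,count) pair and then the requested global row indices */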
1383 
1384     /* Now post the sends */
1385     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
1386     for (i=0; i<nrqs; ++i) {
1387       proc = pa[i];
1388       ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
1389     }
1390 
1391     /* Post Receives to capture the buffer size */
1392     ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr);
1393     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
1394 
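    /* The size-reply receive buffers reuse the second half of tmp (allocated with length
       2*msz): rbuf2[i] points just past the space taken by the outgoing request data */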
1395     if (nrqs) rbuf2[0] = tmp + msz;
1396     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1397 
1398     for (i=0; i<nrqs; ++i) {
1399       proc = pa[i];
1400       ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
1401     }
1402 
1403     ierr = PetscFree2(w1,w2);CHKERRQ(ierr);
1404 
1405     /* Send to other procs the buf size they should allocate */
1406     /* Receive messages*/
1407     ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
1408     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
1409 
1410     ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
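    /* For each incoming request, count the nonzeros of every requested row from the diagonal
       (ai) and off-diagonal (bi) parts and reply with those per-row counts; req_size[i]
       accumulates the total number of column indices/values to be shipped back to requester i */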
1411     for (i=0; i<nrqr; ++i) {
1412       req_size[i] = 0;
1413       rbuf1_i        = rbuf1[i];
1414       start          = 2*rbuf1_i[0] + 1;
1415       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
1416       ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
1417       sbuf2_i        = sbuf2[i];
1418       for (j=start; j<end; j++) {
1419         k            = rbuf1_i[j] - rstart;
1420         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421         sbuf2_i[j]   = ncols;
1422         req_size[i] += ncols;
1423       }
1424       req_source1[i] = r_status1[i].MPI_SOURCE;
1425 
1426       /* form the header */
1427       sbuf2_i[0] = req_size[i];
1428       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1429 
1430       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
1431     }
1432 
1433     ierr = PetscFree(r_status1);CHKERRQ(ierr);
1434     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1435 
1436     /* rbuf2 is received; post receives for the column indices a->j */
1437     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
1438 
1439     ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr);
1440     for (i=0; i<nrqs; ++i) {
1441       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
1442       req_source2[i] = r_status2[i].MPI_SOURCE;
1443       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
1444     }
1445 
1446     /* Wait on sends1 and sends2 */
1447     ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr);
1448     ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
1449     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1450     ierr = PetscFree(s_status1);CHKERRQ(ierr);
1451 
1452     ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
1453     ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);
1454 
1455     /* Now allocate sending buffers for a->j, and send them off */
1456     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
1457     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
1459     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1460 
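    /* Pack each requested row's global column indices: off-diagonal columns below cstart,
       then the diagonal block shifted by cstart, then off-diagonal columns at or above cend;
       since bmap (garray) is sorted this appears to yield the indices in ascending order */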
1461     for (i=0; i<nrqr; i++) { /* for each requested message */
1462       rbuf1_i   = rbuf1[i];
1463       sbuf_aj_i = sbuf_aj[i];
1464       ct1       = 2*rbuf1_i[0] + 1;
1465       ct2       = 0;
1466       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1467 
1468       kmax = rbuf1[i][2];
1469       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470         row    = rbuf1_i[ct1] - rstart;
1471         nzA    = ai[row+1] - ai[row];
1472         nzB    = bi[row+1] - bi[row];
1473         ncols  = nzA + nzB;
1474         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1475 
1476         /* load the column indices for this row into cols*/
1477         cols = sbuf_aj_i + ct2;
1478 
1479         lwrite = 0;
1480         for (l=0; l<nzB; l++) {
1481           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482         }
1483         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484         for (l=0; l<nzB; l++) {
1485           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486         }
1487 
1488         ct2 += ncols;
1489       }
1490       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
1491     }
1492 
1493     /* create column map (cmap): global col of C -> local col of submat */
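    /* With PETSC_USE_CTABLE, locally owned columns (cstart <= col < cend) use the dense array
       cmap_loc for O(1) lookup while off-process columns go through a PetscTable, presumably
       to avoid a dense map of length C->cmap->N; without CTABLE a dense global map is used */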
1494 #if defined(PETSC_USE_CTABLE)
1495     if (!allcolumns) {
1496       ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr);
1497       ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
1498       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499         if (icol[j] >= cstart && icol[j] <cend) {
1500           cmap_loc[icol[j] - cstart] = j+1;
1501         } else { /* use PetscTable for non-local col indices */
1502           ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1503         }
1504       }
1505     } else {
1506       cmap     = NULL;
1507       cmap_loc = NULL;
1508     }
1509     ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
1510 #else
1511     if (!allcolumns) {
1512       ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
1513       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514     } else {
1515       cmap = NULL;
1516     }
1517 #endif
1518 
1519     /* Create lens for MatSeqAIJSetPreallocation() */
1520     ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);
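    /* lens[] is filled in two passes: first from the locally owned rows of C just below, then
       augmented with the off-process counts once the rbuf3 (column index) messages arrive */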
1521 
1522     /* Compute lens from local part of C */
1523     for (j=0; j<nrow; j++) {
1524       row  = irow[j];
1525       proc = row2proc[j];
1526       if (proc == rank) {
1527         /* diagonal part A = c->A */
1528         ncols = ai[row-rstart+1] - ai[row-rstart];
1529         cols  = aj + ai[row-rstart];
1530         if (!allcolumns) {
1531           for (k=0; k<ncols; k++) {
1532 #if defined(PETSC_USE_CTABLE)
1533             tcol = cmap_loc[cols[k]];
1534 #else
1535             tcol = cmap[cols[k]+cstart];
1536 #endif
1537             if (tcol) lens[j]++;
1538           }
1539         } else { /* allcolumns */
1540           lens[j] = ncols;
1541         }
1542 
1543         /* off-diagonal part B = c->B */
1544         ncols = bi[row-rstart+1] - bi[row-rstart];
1545         cols  = bj + bi[row-rstart];
1546         if (!allcolumns) {
1547           for (k=0; k<ncols; k++) {
1548 #if defined(PETSC_USE_CTABLE)
1549             ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1550 #else
1551             tcol = cmap[bmap[cols[k]]];
1552 #endif
1553             if (tcol) lens[j]++;
1554           }
1555         } else { /* allcolumns */
1556           lens[j] += ncols;
1557         }
1558       }
1559     }
1560 
1561     /* Create row map (rmap): global row of C -> local row of submat */
1562 #if defined(PETSC_USE_CTABLE)
1563     ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr);
1564     for (j=0; j<nrow; j++) {
1565       row  = irow[j];
1566       proc = row2proc[j];
1567       if (proc == rank) { /* a local row */
1568         rmap_loc[row - rstart] = j;
1569       } else {
1570         ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1571       }
1572     }
1573 #else
1574     ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
1575     for (j=0; j<nrow; j++) {
1576       rmap[irow[j]] = j;
1577     }
1578 #endif
1579 
1580     /* Update lens from offproc data */
1581     /* recv a->j is done */
1582     ierr    = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr);
1583     for (i=0; i<nrqs; i++) {
1584       proc    = pa[i];
1585       sbuf1_i = sbuf1[proc];
1586       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587       ct1     = 2 + 1;
1588       ct2     = 0;
1589       rbuf2_i = rbuf2[i]; /* received length of C->j */
1590       rbuf3_i = rbuf3[i]; /* received C->j */
1591 
1592       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593       max1   = sbuf1_i[2];
1594       for (k=0; k<max1; k++,ct1++) {
1595 #if defined(PETSC_USE_CTABLE)
1596         ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1597         row--;
1598         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599 #else
1600         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601 #endif
1602         /* Now, store row index of submat in sbuf1_i[ct1] */
1603         sbuf1_i[ct1] = row;
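        /* this cached submat row index is read back directly (as sbuf1_i[ct1]) when the
           received values are assembled, including in later MAT_REUSE_MATRIX calls */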
1604 
1605         nnz = rbuf2_i[ct1];
1606         if (!allcolumns) {
1607           for (l=0; l<nnz; l++,ct2++) {
1608 #if defined(PETSC_USE_CTABLE)
1609             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611             } else {
1612               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1613             }
1614 #else
1615             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616 #endif
1617             if (tcol) lens[row]++;
1618           }
1619         } else { /* allcolumns */
1620           lens[row] += nnz;
1621         }
1622       }
1623     }
1624     ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);
1625     ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);
1626 
1627     /* Create the submatrices */
1628     ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
1629     ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1630 
1631     ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
1632     ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
1633     ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
1634     ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
1635     ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);
1636 
1637     /* create struct Mat_SubSppt and attach it to submat */
1638     ierr = PetscNew(&smatis1);CHKERRQ(ierr);
1639     subc = (Mat_SeqAIJ*)submat->data;
1640     subc->submatis1 = smatis1;
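    /* Everything needed to repeat the communication pattern (buffers, request sizes, proc
       list, maps) is stashed in smatis1 so a MAT_REUSE_MATRIX call can skip the setup above */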
1641 
1642     smatis1->id          = 0;
1643     smatis1->nrqs        = nrqs;
1644     smatis1->nrqr        = nrqr;
1645     smatis1->rbuf1       = rbuf1;
1646     smatis1->rbuf2       = rbuf2;
1647     smatis1->rbuf3       = rbuf3;
1648     smatis1->sbuf2       = sbuf2;
1649     smatis1->req_source2 = req_source2;
1650 
1651     smatis1->sbuf1       = sbuf1;
1652     smatis1->ptr         = ptr;
1653     smatis1->tmp         = tmp;
1654     smatis1->ctr         = ctr;
1655 
1656     smatis1->pa           = pa;
1657     smatis1->req_size     = req_size;
1658     smatis1->req_source1  = req_source1;
1659 
1660     smatis1->allcolumns  = allcolumns;
1661     smatis1->singleis    = PETSC_TRUE;
1662     smatis1->row2proc    = row2proc;
1663     smatis1->rmap        = rmap;
1664     smatis1->cmap        = cmap;
1665 #if defined(PETSC_USE_CTABLE)
1666     smatis1->rmap_loc    = rmap_loc;
1667     smatis1->cmap_loc    = cmap_loc;
1668 #endif
1669 
1670     smatis1->destroy     = submat->ops->destroy;
1671     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672     submat->factortype   = C->factortype;
1673 
1674     /* compute rmax */
1675     rmax = 0;
1676     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1677 
1678   } else { /* scall == MAT_REUSE_MATRIX */
1679     submat = submats[0];
1680     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1681 
1682     subc    = (Mat_SeqAIJ*)submat->data;
1683     rmax    = subc->rmax;
1684     smatis1 = subc->submatis1;
1685     nrqs        = smatis1->nrqs;
1686     nrqr        = smatis1->nrqr;
1687     rbuf1       = smatis1->rbuf1;
1688     rbuf2       = smatis1->rbuf2;
1689     rbuf3       = smatis1->rbuf3;
1690     req_source2 = smatis1->req_source2;
1691 
1692     sbuf1     = smatis1->sbuf1;
1693     sbuf2     = smatis1->sbuf2;
1694     ptr       = smatis1->ptr;
1695     tmp       = smatis1->tmp;
1696     ctr       = smatis1->ctr;
1697 
1698     pa         = smatis1->pa;
1699     req_size   = smatis1->req_size;
1700     req_source1 = smatis1->req_source1;
1701 
1702     allcolumns = smatis1->allcolumns;
1703     row2proc   = smatis1->row2proc;
1704     rmap       = smatis1->rmap;
1705     cmap       = smatis1->cmap;
1706 #if defined(PETSC_USE_CTABLE)
1707     rmap_loc   = smatis1->rmap_loc;
1708     cmap_loc   = smatis1->cmap_loc;
1709 #endif
1710   }
1711 
1712   /* Post recv matrix values */
1713   ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
1714   ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr);
1715   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1716   for (i=0; i<nrqs; ++i) {
1717     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
1718     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
1719   }
1720 
1721   /* Allocate sending buffers for a->a, and send them off */
1722   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
1723   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
1725   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1726 
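  /* The numerical values are packed in exactly the same order as the column indices sent
     with tag3, so rbuf4[i][k] corresponds to the column index received in rbuf3[i][k] */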
1727   for (i=0; i<nrqr; i++) {
1728     rbuf1_i   = rbuf1[i];
1729     sbuf_aa_i = sbuf_aa[i];
1730     ct1       = 2*rbuf1_i[0]+1;
1731     ct2       = 0;
1732     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1733 
1734     kmax = rbuf1_i[2];
1735     for (k=0; k<kmax; k++,ct1++) {
1736       row = rbuf1_i[ct1] - rstart;
1737       nzA = ai[row+1] - ai[row];
1738       nzB = bi[row+1] - bi[row];
1739       ncols  = nzA + nzB;
1740       cworkB = bj + bi[row];
1741       vworkA = a_a + ai[row];
1742       vworkB = b_a + bi[row];
1743 
1744       /* load the column values for this row into vals*/
1745       vals = sbuf_aa_i + ct2;
1746 
1747       lwrite = 0;
1748       for (l=0; l<nzB; l++) {
1749         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750       }
1751       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752       for (l=0; l<nzB; l++) {
1753         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754       }
1755 
1756       ct2 += ncols;
1757     }
1758     ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
1759   }
1760 
1761   /* Assemble submat */
1762   /* First assemble the local rows */
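  /* Rows owned by this rank are copied directly from C's diagonal block (A) and off-diagonal
     block (B); rows owned by other ranks are filled below as the tag4 messages arrive */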
1763   for (j=0; j<nrow; j++) {
1764     row  = irow[j];
1765     proc = row2proc[j];
1766     if (proc == rank) {
1767       Crow = row - rstart;  /* local row index of C */
1768 #if defined(PETSC_USE_CTABLE)
1769       row = rmap_loc[Crow]; /* row index of submat */
1770 #else
1771       row = rmap[row];
1772 #endif
1773 
1774       if (allcolumns) {
1775         /* diagonal part A = c->A */
1776         ncols = ai[Crow+1] - ai[Crow];
1777         cols  = aj + ai[Crow];
1778         vals  = a_a + ai[Crow];
1779         i     = 0;
1780         for (k=0; k<ncols; k++) {
1781           subcols[i]   = cols[k] + cstart;
1782           subvals[i++] = vals[k];
1783         }
1784 
1785         /* off-diagonal part B = c->B */
1786         ncols = bi[Crow+1] - bi[Crow];
1787         cols  = bj + bi[Crow];
1788         vals  = b_a + bi[Crow];
1789         for (k=0; k<ncols; k++) {
1790           subcols[i]   = bmap[cols[k]];
1791           subvals[i++] = vals[k];
1792         }
1793 
1794         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1795 
1796       } else { /* !allcolumns */
1797 #if defined(PETSC_USE_CTABLE)
1798         /* diagonal part A = c->A */
1799         ncols = ai[Crow+1] - ai[Crow];
1800         cols  = aj + ai[Crow];
1801         vals  = a_a + ai[Crow];
1802         i     = 0;
1803         for (k=0; k<ncols; k++) {
1804           tcol = cmap_loc[cols[k]];
1805           if (tcol) {
1806             subcols[i]   = --tcol;
1807             subvals[i++] = vals[k];
1808           }
1809         }
1810 
1811         /* off-diagonal part B = c->B */
1812         ncols = bi[Crow+1] - bi[Crow];
1813         cols  = bj + bi[Crow];
1814         vals  = b_a + bi[Crow];
1815         for (k=0; k<ncols; k++) {
1816           ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
1817           if (tcol) {
1818             subcols[i]   = --tcol;
1819             subvals[i++] = vals[k];
1820           }
1821         }
1822 #else
1823         /* diagonal part A = c->A */
1824         ncols = ai[Crow+1] - ai[Crow];
1825         cols  = aj + ai[Crow];
1826         vals  = a_a + ai[Crow];
1827         i     = 0;
1828         for (k=0; k<ncols; k++) {
1829           tcol = cmap[cols[k]+cstart];
1830           if (tcol) {
1831             subcols[i]   = --tcol;
1832             subvals[i++] = vals[k];
1833           }
1834         }
1835 
1836         /* off-diagonal part B = c->B */
1837         ncols = bi[Crow+1] - bi[Crow];
1838         cols  = bj + bi[Crow];
1839         vals  = b_a + bi[Crow];
1840         for (k=0; k<ncols; k++) {
1841           tcol = cmap[bmap[cols[k]]];
1842           if (tcol) {
1843             subcols[i]   = --tcol;
1844             subvals[i++] = vals[k];
1845           }
1846         }
1847 #endif
1848         ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1849       }
1850     }
1851   }
1852 
1853   /* Now assemble the off-proc rows */
1854   for (i=0; i<nrqs; i++) { /* for each requested message */
1855     /* recv values from other processes */
1856     ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr);
1857     proc    = pa[idex];
1858     sbuf1_i = sbuf1[proc];
1859     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1860     ct1     = 2 + 1;
1861     ct2     = 0; /* count of received C->j */
1862     ct3     = 0; /* count of received C->j that will be inserted into submat */
1863     rbuf2_i = rbuf2[idex]; /* received lengths of the rows of C->j from other processes */
1864     rbuf3_i = rbuf3[idex]; /* received C->j (column indices) from other processes */
1865     rbuf4_i = rbuf4[idex]; /* received C->a (values) from other processes */
1866 
1867     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868     max1 = sbuf1_i[2];             /* num of rows */
1869     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1870       row = sbuf1_i[ct1]; /* row index of submat */
1871       if (!allcolumns) {
1872         idex = 0;
1873         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1876 #if defined(PETSC_USE_CTABLE)
1877             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879             } else {
1880               ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1881             }
1882 #else
1883             tcol = cmap[rbuf3_i[ct2]];
1884 #endif
1885             if (tcol) {
1886               subcols[idex]   = --tcol; /* may not be sorted */
1887               subvals[idex++] = rbuf4_i[ct2];
1888 
1889               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1890                For reuse, we replace the received C->j entry with the index that should be inserted into submat */
1891               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892             }
1893           }
1894           ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1895         } else { /* scall == MAT_REUSE_MATRIX */
1896           submat = submats[0];
1897           subc   = (Mat_SeqAIJ*)submat->data;
1898 
1899           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900           for (l=0; l<nnz; l++) {
1901             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1902             subvals[idex++] = rbuf4_i[ct2];
1903           }
1904 
1905           bj = subc->j + subc->i[row]; /* sorted column indices */
1906           ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1907         }
1908       } else { /* allcolumns */
1909         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1910         ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1911         ct2 += nnz;
1912       }
1913     }
1914   }
1915 
1916   /* sending a->a are done */
1917   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);
1918   ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1919 
1920   ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921   ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   submats[0] = submat;
1923 
1924   /* Restore the indices */
1925   ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1926   if (!allcolumns) {
1927     ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1928   }
1929 
1930   /* Destroy allocated memory */
1931   for (i=0; i<nrqs; ++i) {
1932     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1933   }
1934   ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1935   if (sbuf_aa) {
1936     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1937     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1938   }
1939 
1940   if (scall == MAT_INITIAL_MATRIX) {
1941     ierr = PetscFree(lens);CHKERRQ(ierr);
1942     if (sbuf_aj) {
1943       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1944       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1945     }
1946   }
1947   ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
1948   ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
1949   PetscFunctionReturn(0);
1950 }
1951 
1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1953 {
1954   PetscErrorCode ierr;
1955   PetscInt       ncol;
1956   PetscBool      colflag,allcolumns=PETSC_FALSE;
1957 
1958   PetscFunctionBegin;
1959   /* Allocate memory to hold all the submatrices */
1960   if (scall == MAT_INITIAL_MATRIX) {
1961     ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
1962   }
1963 
1964   /* Check for special case: each processor gets entire matrix columns */
1965   ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
1966   ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
1967   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1968 
1969   ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 }
1972 
1973 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1974 {
1975   PetscErrorCode ierr;
1976   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1977   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1978   Mat_SeqAIJ     *subc;
1979   Mat_SubSppt    *smat;
1980 
1981   PetscFunctionBegin;
1982   /* Check for special case: each processor has a single IS */
1983   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1984     ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1985     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with a non-single IS */
1986     PetscFunctionReturn(0);
1987   }
1988 
1989   /* Collect global wantallmatrix and nstages */
1990   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1991   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1992   if (!nmax) nmax = 1;
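  /* nmax caps the number of submatrices handled per stage; it appears to keep the per-stage
     integer workspace (e.g. dense column maps of length C->cmap->N per submatrix) near 20 MB */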
1993 
1994   if (scall == MAT_INITIAL_MATRIX) {
1995     /* Collect global wantallmatrix and nstages */
1996     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1997       ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
1998       ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
1999       ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
2000       ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
2001       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2002         wantallmatrix = PETSC_TRUE;
2003 
2004         ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
2005       }
2006     }
2007 
2008     /* Determine the number of stages through which submatrices are done
2009        Each stage will extract nmax submatrices.
2010        nmax is determined by the matrix column dimension.
2011        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2012     */
2013     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2014 
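    /* Combine both reductions in one MPI_Allreduce with MPI_MAX: wantallmatrix is encoded as
       -1/0 so the maximum is 0 (false) unless every rank requested the whole matrix, while
       nstages simply takes the global maximum */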
2015     in[0] = -1*(PetscInt)wantallmatrix;
2016     in[1] = nstages;
2017     ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
2018     wantallmatrix = (PetscBool)(-out[0]);
2019     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
2020 
2021   } else { /* MAT_REUSE_MATRIX */
2022     if (ismax) {
2023       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2024       smat = subc->submatis1;
2025     } else { /* (*submat)[0] is a dummy matrix */
2026       smat = (Mat_SubSppt*)(*submat)[0]->data;
2027     }
2028     if (!smat) {
2029       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2030       wantallmatrix = PETSC_TRUE;
2031     } else if (smat->singleis) {
2032       ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2033       PetscFunctionReturn(0);
2034     } else {
2035       nstages = smat->nstages;
2036     }
2037   }
2038 
2039   if (wantallmatrix) {
2040     ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2041     PetscFunctionReturn(0);
2042   }
2043 
2044   /* Allocate memory to hold all the submatrices and dummy submatrices */
2045   if (scall == MAT_INITIAL_MATRIX) {
2046     ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
2047   }
2048 
2049   for (i=0,pos=0; i<nstages; i++) {
2050     if (pos+nmax <= ismax) max_no = nmax;
2051     else if (pos >= ismax) max_no = 0;
2052     else                   max_no = ismax-pos;
2053 
2054     ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2055     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2056       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2057       smat->nstages = nstages;
2058     }
2059     pos += max_no;
2060   }
2061 
2062   if (ismax && scall == MAT_INITIAL_MATRIX) {
2063     /* save nstages for reuse */
2064     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2065     smat = subc->submatis1;
2066     smat->nstages = nstages;
2067   }
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 /* -------------------------------------------------------------------------*/
2072 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2073 {
2074   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2075   Mat            A  = c->A;
2076   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2077   const PetscInt **icol,**irow;
2078   PetscInt       *nrow,*ncol,start;
2079   PetscErrorCode ierr;
2080   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2081   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2082   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2083   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2084   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2085 #if defined(PETSC_USE_CTABLE)
2086   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2087 #else
2088   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2089 #endif
2090   const PetscInt *irow_i;
2091   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2092   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2093   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2094   MPI_Status     *r_status1,*r_status2;
2095   MPI_Comm       comm;
2096   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2097   PetscMPIInt    *onodes1,*olengths1,end;
2098   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2099   Mat_SubSppt    *smat_i;
2100   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2101   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2102 
2103   PetscFunctionBegin;
2104   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2105   size = c->size;
2106   rank = c->rank;
2107 
2108   ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
2109   ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2110 
2111   for (i=0; i<ismax; i++) {
2112     ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2113     if (!issorted[i]) iscsorted = issorted[i];
2114 
2115     ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2116 
2117     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2118     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2119 
2120     /* Check for special case: allcolumn */
2121     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
2122     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
2123     if (colflag && ncol[i] == C->cmap->N) {
2124       allcolumns[i] = PETSC_TRUE;
2125       icol[i] = NULL;
2126     } else {
2127       allcolumns[i] = PETSC_FALSE;
2128       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
2129     }
2130   }
2131 
2132   if (scall == MAT_REUSE_MATRIX) {
2133     /* Assumes the new rows have the same lengths as the old rows */
2134     for (i=0; i<ismax; i++) {
2135       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2136       subc = (Mat_SeqAIJ*)submats[i]->data;
2137       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2138 
2139       /* Initialize the matrix as if it were empty */
2140       ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);
2141 
2142       smat_i   = subc->submatis1;
2143 
2144       nrqs        = smat_i->nrqs;
2145       nrqr        = smat_i->nrqr;
2146       rbuf1       = smat_i->rbuf1;
2147       rbuf2       = smat_i->rbuf2;
2148       rbuf3       = smat_i->rbuf3;
2149       req_source2 = smat_i->req_source2;
2150 
2151       sbuf1     = smat_i->sbuf1;
2152       sbuf2     = smat_i->sbuf2;
2153       ptr       = smat_i->ptr;
2154       tmp       = smat_i->tmp;
2155       ctr       = smat_i->ctr;
2156 
2157       pa          = smat_i->pa;
2158       req_size    = smat_i->req_size;
2159       req_source1 = smat_i->req_source1;
2160 
2161       allcolumns[i] = smat_i->allcolumns;
2162       row2proc[i]   = smat_i->row2proc;
2163       rmap[i]       = smat_i->rmap;
2164       cmap[i]       = smat_i->cmap;
2165     }
2166 
2167     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2168       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2169       smat_i = (Mat_SubSppt*)submats[0]->data;
2170 
2171       nrqs        = smat_i->nrqs;
2172       nrqr        = smat_i->nrqr;
2173       rbuf1       = smat_i->rbuf1;
2174       rbuf2       = smat_i->rbuf2;
2175       rbuf3       = smat_i->rbuf3;
2176       req_source2 = smat_i->req_source2;
2177 
2178       sbuf1       = smat_i->sbuf1;
2179       sbuf2       = smat_i->sbuf2;
2180       ptr         = smat_i->ptr;
2181       tmp         = smat_i->tmp;
2182       ctr         = smat_i->ctr;
2183 
2184       pa          = smat_i->pa;
2185       req_size    = smat_i->req_size;
2186       req_source1 = smat_i->req_source1;
2187 
2188       allcolumns[0] = PETSC_FALSE;
2189     }
2190   } else { /* scall == MAT_INITIAL_MATRIX */
2191     /* Get some new tags to keep the communication clean */
2192     ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
2193     ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
2194 
2195     /* evaluate communication - message to whom, length of message, and buffer space
2196      required. Based on this, buffers are allocated, and data is copied into them */
2197     ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */
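    /* w4[p]: rows of the current IS owned by proc p (scratch, reset per IS); w1[p]: total
       number of rows requested from p (the header length is added later); w3[p]: number of
       ISs requesting rows from p; w2[p]: 1 if a message will be sent to p */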
2198 
2199     for (i=0; i<ismax; i++) {
2200       jmax   = nrow[i];
2201       irow_i = irow[i];
2202 
2203       ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
2204       row2proc[i] = row2proc_i;
2205 
2206       if (issorted[i]) proc = 0;
2207       for (j=0; j<jmax; j++) {
2208         if (!issorted[i]) proc = 0;
2209         row = irow_i[j];
2210         while (row >= C->rmap->range[proc+1]) proc++;
2211         w4[proc]++;
2212         row2proc_i[j] = proc; /* map row index to proc */
2213       }
2214       for (j=0; j<size; j++) {
2215         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2216       }
2217     }
2218 
2219     nrqs     = 0;              /* no of outgoing messages */
2220     msz      = 0;              /* total mesg length (for all procs) */
2221     w1[rank] = 0;              /* no mesg sent to self */
2222     w3[rank] = 0;
2223     for (i=0; i<size; i++) {
2224       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2225     }
2226     ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/
2227     for (i=0,j=0; i<size; i++) {
2228       if (w1[i]) { pa[j] = i; j++; }
2229     }
2230 
2231     /* Each message would have a header = 1 + 2*(no of IS) + data */
2232     for (i=0; i<nrqs; i++) {
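    /* Header layout of sbuf1[p]: [nIS, is_no_1, count_1, ..., is_no_nIS, count_nIS], followed
       by the requested global row indices grouped by IS */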
2233       j      = pa[i];
2234       w1[j] += w2[j] + 2* w3[j];
2235       msz   += w1[j];
2236     }
2237     ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
2238 
2239     /* Determine the number of messages to expect, their lengths, and their source ranks (from-ids) */
2240     ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
2241     ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
2242 
2243     /* Now post the Irecvs corresponding to these messages */
2244     tag0 = ((PetscObject)C)->tag;
2245     ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
2246 
2247     ierr = PetscFree(onodes1);CHKERRQ(ierr);
2248     ierr = PetscFree(olengths1);CHKERRQ(ierr);
2249 
2250     /* Allocate Memory for outgoing messages */
2251     ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
2252     ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
2253     ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
2254 
2255     {
2256       PetscInt *iptr = tmp;
2257       k    = 0;
2258       for (i=0; i<nrqs; i++) {
2259         j        = pa[i];
2260         iptr    += k;
2261         sbuf1[j] = iptr;
2262         k        = w1[j];
2263       }
2264     }
2265 
2266     /* Form the outgoing messages. Initialize the header space */
2267     for (i=0; i<nrqs; i++) {
2268       j           = pa[i];
2269       sbuf1[j][0] = 0;
2270       ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
2271       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2272     }
2273 
2274     /* Parse the isrow and copy data into outbuf */
2275     for (i=0; i<ismax; i++) {
2276       row2proc_i = row2proc[i];
2277       ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
2278       irow_i = irow[i];
2279       jmax   = nrow[i];
2280       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2281         proc = row2proc_i[j];
2282         if (proc != rank) { /* copy to the outgoing buf*/
2283           ctr[proc]++;
2284           *ptr[proc] = irow_i[j];
2285           ptr[proc]++;
2286         }
2287       }
2288       /* Update the headers for the current IS */
2289       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2290         if ((ctr_j = ctr[j])) {
2291           sbuf1_j        = sbuf1[j];
2292           k              = ++sbuf1_j[0];
2293           sbuf1_j[2*k]   = ctr_j;
2294           sbuf1_j[2*k-1] = i;
2295         }
2296       }
2297     }
2298 
2299     /*  Now  post the sends */
2300     ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
2301     for (i=0; i<nrqs; ++i) {
2302       j    = pa[i];
2303       ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr);
2304     }
2305 
2306     /* Post Receives to capture the buffer size */
2307     ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr);
2308     ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);
2309     if (nrqs) rbuf2[0] = tmp + msz;
2310     for (i=1; i<nrqs; ++i) {
2311       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2312     }
2313     for (i=0; i<nrqs; ++i) {
2314       j    = pa[i];
2315       ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
2316     }
2317 
2318     /* Send to other procs the buf size they should allocate */
2319     /* Receive messages*/
2320     ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
2321     ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
2322     ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
2323     {
2324       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2325       PetscInt   *sbuf2_i;
2326 
2327       ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
2328       for (i=0; i<nrqr; ++i) {
2329         req_size[i] = 0;
2330         rbuf1_i        = rbuf1[i];
2331         start          = 2*rbuf1_i[0] + 1;
2332         ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
2333         ierr           = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
2334         sbuf2_i        = sbuf2[i];
2335         for (j=start; j<end; j++) {
2336           id              = rbuf1_i[j] - rstart;
2337           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2338           sbuf2_i[j]      = ncols;
2339           req_size[i] += ncols;
2340         }
2341         req_source1[i] = r_status1[i].MPI_SOURCE;
2342         /* form the header */
2343         sbuf2_i[0] = req_size[i];
2344         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2345 
2346         ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
2347       }
2348     }
2349     ierr = PetscFree(r_status1);CHKERRQ(ierr);
2350     ierr = PetscFree(r_waits1);CHKERRQ(ierr);
2351     ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
2352 
2353     /* Receive messages*/
2354     ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr);
2355     ierr = PetscMalloc1(nrqs,&r_status2);CHKERRQ(ierr);
2356 
2357     ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);
2358     for (i=0; i<nrqs; ++i) {
2359       ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
2360       req_source2[i] = r_status2[i].MPI_SOURCE;
2361       ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
2362     }
2363     ierr = PetscFree(r_status2);CHKERRQ(ierr);
2364     ierr = PetscFree(r_waits2);CHKERRQ(ierr);
2365 
2366     /* Wait on sends1 and sends2 */
2367     ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2368     ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2369     ierr = PetscFree(s_waits1);CHKERRQ(ierr);
2370     ierr = PetscFree(s_waits2);CHKERRQ(ierr);
2371 
2372     /* Now allocate sending buffers for a->j, and send them off */
2373     ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
2374     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2375     if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
2376     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2377 
2378     ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr);
2379     {
2380       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2381       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2382       PetscInt cend = C->cmap->rend;
2383       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2384 
2385       for (i=0; i<nrqr; i++) {
2386         rbuf1_i   = rbuf1[i];
2387         sbuf_aj_i = sbuf_aj[i];
2388         ct1       = 2*rbuf1_i[0] + 1;
2389         ct2       = 0;
2390         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2391           kmax = rbuf1[i][2*j];
2392           for (k=0; k<kmax; k++,ct1++) {
2393             row    = rbuf1_i[ct1] - rstart;
2394             nzA    = a_i[row+1] - a_i[row];
2395             nzB    = b_i[row+1] - b_i[row];
2396             ncols  = nzA + nzB;
2397             cworkA = a_j + a_i[row];
2398             cworkB = b_j + b_i[row];
2399 
2400             /* load the column indices for this row into cols */
2401             cols = sbuf_aj_i + ct2;
2402 
2403             lwrite = 0;
2404             for (l=0; l<nzB; l++) {
2405               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2406             }
2407             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2408             for (l=0; l<nzB; l++) {
2409               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2410             }
2411 
2412             ct2 += ncols;
2413           }
2414         }
2415         ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
2416       }
2417     }
2418 
2419     /* create col map: global col of C -> local col of submatrices */
2420     {
2421       const PetscInt *icol_i;
2422 #if defined(PETSC_USE_CTABLE)
2423       for (i=0; i<ismax; i++) {
2424         if (!allcolumns[i]) {
2425           ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2426 
2427           jmax   = ncol[i];
2428           icol_i = icol[i];
2429           cmap_i = cmap[i];
2430           for (j=0; j<jmax; j++) {
2431             ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2432           }
2433         } else cmap[i] = NULL;
2434       }
2435 #else
2436       for (i=0; i<ismax; i++) {
2437         if (!allcolumns[i]) {
2438           ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
2439           jmax   = ncol[i];
2440           icol_i = icol[i];
2441           cmap_i = cmap[i];
2442           for (j=0; j<jmax; j++) {
2443             cmap_i[icol_i[j]] = j+1;
2444           }
2445         } else cmap[i] = NULL;
2446       }
2447 #endif
2448     }
2449 
2450     /* Create lens which is required for MatCreate... */
2451     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2452     ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
2453 
2454     if (ismax) {
2455       ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
2456     }
2457     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2458 
2459     /* Update lens from local data */
2460     for (i=0; i<ismax; i++) {
2461       row2proc_i = row2proc[i];
2462       jmax = nrow[i];
2463       if (!allcolumns[i]) cmap_i = cmap[i];
2464       irow_i = irow[i];
2465       lens_i = lens[i];
2466       for (j=0; j<jmax; j++) {
2467         row = irow_i[j];
2468         proc = row2proc_i[j];
2469         if (proc == rank) {
2470           ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2471           if (!allcolumns[i]) {
2472             for (k=0; k<ncols; k++) {
2473 #if defined(PETSC_USE_CTABLE)
2474               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2475 #else
2476               tcol = cmap_i[cols[k]];
2477 #endif
2478               if (tcol) lens_i[j]++;
2479             }
2480           } else { /* allcolumns */
2481             lens_i[j] = ncols;
2482           }
2483           ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
2484         }
2485       }
2486     }
2487 
2488     /* Create row map: global row of C -> local row of submatrices */
2489 #if defined(PETSC_USE_CTABLE)
2490     for (i=0; i<ismax; i++) {
2491       ierr   = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2492       irow_i = irow[i];
2493       jmax   = nrow[i];
2494       for (j=0; j<jmax; j++) {
2495         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
2496       }
2497     }
2498 #else
2499     for (i=0; i<ismax; i++) {
2500       ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
2501       rmap_i = rmap[i];
2502       irow_i = irow[i];
2503       jmax   = nrow[i];
2504       for (j=0; j<jmax; j++) {
2505         rmap_i[irow_i[j]] = j;
2506       }
2507     }
2508 #endif
2509 
2510     /* Update lens from offproc data */
2511     {
2512       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2513 
2514       ierr    = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2515       for (tmp2=0; tmp2<nrqs; tmp2++) {
2516         sbuf1_i = sbuf1[pa[tmp2]];
2517         jmax    = sbuf1_i[0];
2518         ct1     = 2*jmax+1;
2519         ct2     = 0;
2520         rbuf2_i = rbuf2[tmp2];
2521         rbuf3_i = rbuf3[tmp2];
2522         for (j=1; j<=jmax; j++) {
2523           is_no  = sbuf1_i[2*j-1];
2524           max1   = sbuf1_i[2*j];
2525           lens_i = lens[is_no];
2526           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2527           rmap_i = rmap[is_no];
2528           for (k=0; k<max1; k++,ct1++) {
2529 #if defined(PETSC_USE_CTABLE)
2530             ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
2531             row--;
2532             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2533 #else
2534             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2535 #endif
2536             max2 = rbuf2_i[ct1];
2537             for (l=0; l<max2; l++,ct2++) {
2538               if (!allcolumns[is_no]) {
2539 #if defined(PETSC_USE_CTABLE)
2540                 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2541 #else
2542                 tcol = cmap_i[rbuf3_i[ct2]];
2543 #endif
2544                 if (tcol) lens_i[row]++;
2545               } else { /* allcolumns */
2546                 lens_i[row]++; /* lens_i[row] += max2 ? */
2547               }
2548             }
2549           }
2550         }
2551       }
2552     }
2553     ierr = PetscFree(r_waits3);CHKERRQ(ierr);
2554     ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2555     ierr = PetscFree(s_waits3);CHKERRQ(ierr);
2556 
2557     /* Create the submatrices */
2558     for (i=0; i<ismax; i++) {
2559       PetscInt    rbs,cbs;
2560 
2561       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
2562       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
2563 
2564       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
2565       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2566 
2567       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
2568       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
2569       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
2570 
2571       /* create struct Mat_SubSppt and attach it to submat */
2572       ierr = PetscNew(&smat_i);CHKERRQ(ierr);
2573       subc = (Mat_SeqAIJ*)submats[i]->data;
2574       subc->submatis1 = smat_i;
2575 
2576       smat_i->destroy          = submats[i]->ops->destroy;
2577       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2578       submats[i]->factortype   = C->factortype;
2579 
2580       smat_i->id          = i;
2581       smat_i->nrqs        = nrqs;
2582       smat_i->nrqr        = nrqr;
2583       smat_i->rbuf1       = rbuf1;
2584       smat_i->rbuf2       = rbuf2;
2585       smat_i->rbuf3       = rbuf3;
2586       smat_i->sbuf2       = sbuf2;
2587       smat_i->req_source2 = req_source2;
2588 
2589       smat_i->sbuf1       = sbuf1;
2590       smat_i->ptr         = ptr;
2591       smat_i->tmp         = tmp;
2592       smat_i->ctr         = ctr;
2593 
2594       smat_i->pa           = pa;
2595       smat_i->req_size     = req_size;
2596       smat_i->req_source1  = req_source1;
2597 
2598       smat_i->allcolumns  = allcolumns[i];
2599       smat_i->singleis    = PETSC_FALSE;
2600       smat_i->row2proc    = row2proc[i];
2601       smat_i->rmap        = rmap[i];
2602       smat_i->cmap        = cmap[i];
2603     }
2604 
2605     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
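      /* Ranks with no IS in this stage still took part in the collective communication above;
         the dummy matrix keeps that communication pattern so a later MAT_REUSE_MATRIX call
         works uniformly on all ranks */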
2606       ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
2607       ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
2608       ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
2609 
2610       /* create struct Mat_SubSppt and attach it to submat */
2611       ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
2612       submats[0]->data = (void*)smat_i;
2613 
2614       smat_i->destroy          = submats[0]->ops->destroy;
2615       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2616       submats[0]->factortype   = C->factortype;
2617 
2618       smat_i->id          = 0;
2619       smat_i->nrqs        = nrqs;
2620       smat_i->nrqr        = nrqr;
2621       smat_i->rbuf1       = rbuf1;
2622       smat_i->rbuf2       = rbuf2;
2623       smat_i->rbuf3       = rbuf3;
2624       smat_i->sbuf2       = sbuf2;
2625       smat_i->req_source2 = req_source2;
2626 
2627       smat_i->sbuf1       = sbuf1;
2628       smat_i->ptr         = ptr;
2629       smat_i->tmp         = tmp;
2630       smat_i->ctr         = ctr;
2631 
2632       smat_i->pa           = pa;
2633       smat_i->req_size     = req_size;
2634       smat_i->req_source1  = req_source1;
2635 
2636       smat_i->allcolumns  = PETSC_FALSE;
2637       smat_i->singleis    = PETSC_FALSE;
2638       smat_i->row2proc    = NULL;
2639       smat_i->rmap        = NULL;
2640       smat_i->cmap        = NULL;
2641     }
2642 
2643     if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
2644     ierr = PetscFree(lens);CHKERRQ(ierr);
2645     if (sbuf_aj) {
2646       ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
2647       ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
2648     }
2649 
2650   } /* endof scall == MAT_INITIAL_MATRIX */
2651 
2652   /* Post recv matrix values */
2653   ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
2654   ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr);
2655   ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr);
2656   for (i=0; i<nrqs; ++i) {
2657     ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr);
2658     ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr);
2659   }
2660 
2661   /* Allocate sending buffers for a->a, and send them off */
2662   ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr);
2663   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2664   if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);}
2665   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2666 
2667   ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr);
2668   {
2669     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2670     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2671     PetscInt    cend   = C->cmap->rend;
2672     PetscInt    *b_j   = b->j;
2673     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2674 
2675     ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2676     ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2677     for (i=0; i<nrqr; i++) {
2678       rbuf1_i   = rbuf1[i];
2679       sbuf_aa_i = sbuf_aa[i];
2680       ct1       = 2*rbuf1_i[0]+1;
2681       ct2       = 0;
2682       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2683         kmax = rbuf1_i[2*j];
2684         for (k=0; k<kmax; k++,ct1++) {
2685           row    = rbuf1_i[ct1] - rstart;
2686           nzA    = a_i[row+1] - a_i[row];
2687           nzB    = b_i[row+1] - b_i[row];
2688           ncols  = nzA + nzB;
2689           cworkB = b_j + b_i[row];
2690           vworkA = a_a + a_i[row];
2691           vworkB = b_a + b_i[row];
2692 
2693           /* load the column values for this row into vals*/
2694           vals = sbuf_aa_i+ct2;
2695 
2696           lwrite = 0;
2697           for (l=0; l<nzB; l++) {
2698             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2699           }
2700           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2701           for (l=0; l<nzB; l++) {
2702             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2703           }
2704 
2705           ct2 += ncols;
2706         }
2707       }
2708       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr);
2709     }
2710     ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
2711     ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
2712   }
2713 
2714   /* Assemble the matrices */
2715   /* First assemble the local rows */
2716   for (i=0; i<ismax; i++) {
2717     row2proc_i = row2proc[i];
2718     subc      = (Mat_SeqAIJ*)submats[i]->data;
2719     imat_ilen = subc->ilen;
2720     imat_j    = subc->j;
2721     imat_i    = subc->i;
2722     imat_a    = subc->a;
2723 
2724     if (!allcolumns[i]) cmap_i = cmap[i];
2725     rmap_i = rmap[i];
2726     irow_i = irow[i];
2727     jmax   = nrow[i];
2728     for (j=0; j<jmax; j++) {
2729       row  = irow_i[j];
2730       proc = row2proc_i[j];
2731       if (proc == rank) {
2732         old_row = row;
2733 #if defined(PETSC_USE_CTABLE)
2734         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2735         row--;
2736 #else
2737         row = rmap_i[row];
2738 #endif
2739         ilen_row = imat_ilen[row];
2740         ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2741         mat_i    = imat_i[row];
2742         mat_a    = imat_a + mat_i;
2743         mat_j    = imat_j + mat_i;
2744         if (!allcolumns[i]) {
2745           for (k=0; k<ncols; k++) {
2746 #if defined(PETSC_USE_CTABLE)
2747             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
2748 #else
2749             tcol = cmap_i[cols[k]];
2750 #endif
2751             if (tcol) {
2752               *mat_j++ = tcol - 1;
2753               *mat_a++ = vals[k];
2754               ilen_row++;
2755             }
2756           }
2757         } else { /* allcolumns */
2758           for (k=0; k<ncols; k++) {
2759             *mat_j++ = cols[k];  /* global col index! */
2760             *mat_a++ = vals[k];
2761             ilen_row++;
2762           }
2763         }
2764         ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
2765 
2766         imat_ilen[row] = ilen_row;
2767       }
2768     }
2769   }
2770 
2771   /* Now assemble the off proc rows */
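       /* For each request sent to process pa[tmp2]: sbuf1 records which rows were asked for (per IS),
          rbuf2 carries the number of columns in each of those rows, and rbuf3/rbuf4 carry the matching
          global column indices and values in the same packed order. */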
2772   ierr    = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2773   for (tmp2=0; tmp2<nrqs; tmp2++) {
2774     sbuf1_i = sbuf1[pa[tmp2]];
2775     jmax    = sbuf1_i[0];
2776     ct1     = 2*jmax + 1;
2777     ct2     = 0;
2778     rbuf2_i = rbuf2[tmp2];
2779     rbuf3_i = rbuf3[tmp2];
2780     rbuf4_i = rbuf4[tmp2];
2781     for (j=1; j<=jmax; j++) {
2782       is_no     = sbuf1_i[2*j-1];
2783       rmap_i    = rmap[is_no];
2784       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2785       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2786       imat_ilen = subc->ilen;
2787       imat_j    = subc->j;
2788       imat_i    = subc->i;
2789       imat_a    = subc->a;
2790       max1      = sbuf1_i[2*j];
2791       for (k=0; k<max1; k++,ct1++) {
2792         row = sbuf1_i[ct1];
2793 #if defined(PETSC_USE_CTABLE)
2794         ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
2795         row--;
2796 #else
2797         row = rmap_i[row];
2798 #endif
2799         ilen  = imat_ilen[row];
2800         mat_i = imat_i[row];
2801         mat_a = imat_a + mat_i;
2802         mat_j = imat_j + mat_i;
2803         max2  = rbuf2_i[ct1];
2804         if (!allcolumns[is_no]) {
2805           for (l=0; l<max2; l++,ct2++) {
2806 #if defined(PETSC_USE_CTABLE)
2807             ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
2808 #else
2809             tcol = cmap_i[rbuf3_i[ct2]];
2810 #endif
2811             if (tcol) {
2812               *mat_j++ = tcol - 1;
2813               *mat_a++ = rbuf4_i[ct2];
2814               ilen++;
2815             }
2816           }
2817         } else { /* allcolumns */
2818           for (l=0; l<max2; l++,ct2++) {
2819             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2820             *mat_a++ = rbuf4_i[ct2];
2821             ilen++;
2822           }
2823         }
2824         imat_ilen[row] = ilen;
2825       }
2826     }
2827   }
2828 
2829   if (!iscsorted) { /* sort column indices of the rows */
2830     for (i=0; i<ismax; i++) {
2831       subc      = (Mat_SeqAIJ*)submats[i]->data;
2832       imat_j    = subc->j;
2833       imat_i    = subc->i;
2834       imat_a    = subc->a;
2835       imat_ilen = subc->ilen;
2836 
2837       if (allcolumns[i]) continue;
2838       jmax = nrow[i];
2839       for (j=0; j<jmax; j++) {
2840         mat_i = imat_i[j];
2841         mat_a = imat_a + mat_i;
2842         mat_j = imat_j + mat_i;
2843         ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
2844       }
2845     }
2846   }
2847 
2848   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
2849   ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
2850   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
2851 
2852   /* Restore the indices */
2853   for (i=0; i<ismax; i++) {
2854     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
2855     if (!allcolumns[i]) {
2856       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
2857     }
2858   }
2859 
2860   for (i=0; i<ismax; i++) {
2861     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2862     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2863   }
2864 
2865   /* Destroy allocated memory */
2866   if (sbuf_aa) {
2867     ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
2868     ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
2869   }
2870   ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
2871 
2872   for (i=0; i<nrqs; ++i) {
2873     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2874   }
2875   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2876 
2877   ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2878   PetscFunctionReturn(0);
2879 }
2880 
2881 /*
2882  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2883  Embeddings are assumed to be injections, which implies that the range of rowemb is a subset
2884  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2885  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2886  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2887  state, and needs to be "assembled" later by compressing B's column space.
2888 
2889  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2890  Following this call, C->A & C->B have been created, even if empty.
2891  */
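     /*
      Illustrative example (hypothetical sizes): suppose C has N = 100 global columns and this rank owns
      columns [rstart,rend) = [40,60), so n = 20 and the shift applied below is rend-rstart = 20.  In the
      "disassembled" mapping, a compacted off-diagonal column index 35 (< rstart) already is global column 35,
      while a compacted index 45 (>= rstart) maps to global column 45 + 20 = 65.
     */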
2892 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2893 {
2894   /* If making this function public, change the error returned in this function away from _PLIB. */
2895   PetscErrorCode ierr;
2896   Mat_MPIAIJ     *aij;
2897   Mat_SeqAIJ     *Baij;
2898   PetscBool      seqaij,Bdisassembled;
2899   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2900   PetscScalar    v;
2901   const PetscInt *rowindices,*colindices;
2902 
2903   PetscFunctionBegin;
2904   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2905   if (A) {
2906     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2907     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2908     if (rowemb) {
2909       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2910       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2911     } else {
2912       if (C->rmap->n != A->rmap->n) {
2913         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2914       }
2915     }
2916     if (dcolemb) {
2917       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2918       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2919     } else {
2920       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2921     }
2922   }
2923   if (B) {
2924     ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2925     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2926     if (rowemb) {
2927       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2928       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2929     } else {
2930       if (C->rmap->n != B->rmap->n) {
2931         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2932       }
2933     }
2934     if (ocolemb) {
2935       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2936       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2937     } else {
2938       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2939     }
2940   }
2941 
2942   aij = (Mat_MPIAIJ*)C->data;
2943   if (!aij->A) {
2944     /* Mimic parts of MatMPIAIJSetPreallocation() */
2945     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
2946     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
2947     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
2948     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
2949     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
2950   }
2951   if (A) {
2952     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
2953   } else {
2954     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
2955   }
2956   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2957     /*
2958       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2959       need to "disassemble" B -- convert it to using C's global indices.
2960       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2961 
2962       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2963       we call MatZeroEntries() on the existing aij->B first, so there may be stored zeros that could, in principle, be compacted out.
2964 
2965       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2966       At least avoid calling MatSetValues() and the implied searches?
2967     */
2968 
2969     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2970 #if defined(PETSC_USE_CTABLE)
2971       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
2972 #else
2973       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
2974       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2975       if (aij->B) {
2976         ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
2977       }
2978 #endif
2979       ngcol = 0;
2980       if (aij->lvec) {
2981         ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
2982       }
2983       if (aij->garray) {
2984         ierr = PetscFree(aij->garray);CHKERRQ(ierr);
2985         ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
2986       }
2987       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
2988       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
2989     }
2990     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2991       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
2992     }
2993     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2994       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
2995     }
2996   }
2997   Bdisassembled = PETSC_FALSE;
2998   if (!aij->B) {
2999     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
3000     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
3001     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
3002     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
3003     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
3004     Bdisassembled = PETSC_TRUE;
3005   }
3006   if (B) {
3007     Baij = (Mat_SeqAIJ*)B->data;
3008     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3009       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
3010       for (i=0; i<B->rmap->n; i++) {
3011         nz[i] = Baij->i[i+1] - Baij->i[i];
3012       }
3013       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
3014       ierr = PetscFree(nz);CHKERRQ(ierr);
3015     }
3016 
3017     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
3018     shift = rend-rstart;
3019     count = 0;
3020     rowindices = NULL;
3021     colindices = NULL;
3022     if (rowemb) {
3023       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
3024     }
3025     if (ocolemb) {
3026       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
3027     }
3028     for (i=0; i<B->rmap->n; i++) {
3029       PetscInt row;
3030       row = i;
3031       if (rowindices) row = rowindices[i];
3032       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3033         col  = Baij->j[count];
3034         if (colindices) col = colindices[col];
3035         if (Bdisassembled && col>=rstart) col += shift;
3036         v    = Baij->a[count];
3037         ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
3038         ++count;
3039       }
3040     }
3041     /* No assembly for aij->B is necessary. */
3042     /* FIXME: set aij->B's nonzerostate correctly. */
3043   } else {
3044     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
3045   }
3046   C->preallocated  = PETSC_TRUE;
3047   C->was_assembled = PETSC_FALSE;
3048   C->assembled     = PETSC_FALSE;
3049    /*
3050       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3051       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3052    */
3053   PetscFunctionReturn(0);
3054 }
3055 
3056 /*
3057   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3058  */
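     /*
      Illustrative sketch (not called anywhere in this file): recovering the global column of the k-th stored
      entry in row i of *B after C has been assembled, using C's garray:

        Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
        Mat_SeqAIJ *b   = (Mat_SeqAIJ*)(*B)->data;
        PetscInt   gcol = aij->garray[b->j[b->i[i] + k]];
     */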
3059 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3060 {
3061   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3062 
3063   PetscFunctionBegin;
3064   PetscValidPointer(A,2);
3065   PetscValidPointer(B,3);
3066   /* FIXME: make sure C is assembled */
3067   *A = aij->A;
3068   *B = aij->B;
3069   /* Note that we don't incref *A and *B, so be careful! */
3070   PetscFunctionReturn(0);
3071 }
3072 
3073 /*
3074   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3075   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3076 */
3077 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3078                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3079                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3080                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3081                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3082 {
3083   PetscErrorCode ierr;
3084   PetscMPIInt    size,flag;
3085   PetscInt       i,ii,cismax,ispar;
3086   Mat            *A,*B;
3087   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3088 
3089   PetscFunctionBegin;
3090   if (!ismax) PetscFunctionReturn(0);
3091 
3092   for (i = 0, cismax = 0; i < ismax; ++i) {
3093     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
3094     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3095     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
3096     if (size > 1) ++cismax;
3097   }
3098 
3099   /*
3100      If cismax is zero on all of C's ranks, then and only then can we use purely sequential matrix extraction.
3101      ispar is the maximum of cismax over C's comm; it is nonzero iff at least one rank holds a parallel IS.
3102   */
3103   ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
3104   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3105     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
3106     PetscFunctionReturn(0);
3107   }
3108 
3109   /* if (ispar) */
3110   /*
3111     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3112     These are used to extract the off-diag portion of the resulting parallel matrix.
3113     The row IS for the off-diag portion is the same as for the diag portion,
3114     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3115   */
3116   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
3117   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
3118   for (i = 0, ii = 0; i < ismax; ++i) {
3119     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3120     if (size > 1) {
3121       /*
3122          TODO: This is the part that's ***NOT SCALABLE***.
3123          To fix this we need to extract just the indices of C's nonzero columns
3124          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3125          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3126          to be done without serializing on the IS list, so, most likely, it is best
3127          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3128       */
3129       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
3130       /* Now we have to
3131          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3132              were sorted on each rank, concatenated they might no longer be sorted;
3133          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3134              indices in the nondecreasing order to the original index positions.
3135          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3136       */
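           /* e.g. (illustrative): if ciscol[ii] = {7,3,9}, ISSort() below yields {3,7,9} and the permutation
              ciscol_p[ii] = {1,0,2} records, for each sorted position, the original position of that index */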
3137       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
3138       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
3139       ++ii;
3140     }
3141   }
3142   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
3143   for (i = 0, ii = 0; i < ismax; ++i) {
3144     PetscInt       j,issize;
3145     const PetscInt *indices;
3146 
3147     /*
3148        Sort the indices into nondecreasing order, recording the permutation, and reject row or col index sets with duplicate entries.
3149      */
3150     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
3151     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
3152     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
3153     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
3154     for (j = 1; j < issize; ++j) {
3155       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3156     }
3157     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
3158     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
3159     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
3160     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
3161     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
3162     for (j = 1; j < issize; ++j) {
3163       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3164     }
3165     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
3166     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3167     if (size > 1) {
3168       cisrow[ii] = isrow[i];
3169       ++ii;
3170     }
3171   }
3172   /*
3173     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3174     array of sequential matrices underlying the resulting parallel matrices.
3175     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3176     contain duplicates.
3177 
3178     There are as many diag matrices as there are original index sets. There are only as many parallel
3179     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3180     and off-diag matrices as there are parallel (comm size > 1) index sets.
3181     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3182     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3183       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3184       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3185       with A[i] and B[ii] extracted from the corresponding MPI submat.
3186     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3187       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3188       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3189       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3190       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3191     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3192       A[i], B[ii], AA[i] or BB[ii] matrices.
3193   */
3194   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3195   if (scall == MAT_INITIAL_MATRIX) {
3196     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
3197   }
3198 
3199   /* Now obtain the sequential A and B submatrices separately. */
3200   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3201   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
3202   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);
3203 
3204   /*
3205     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3206     matrices A & B have been extracted directly into the parallel matrices containing them, or
3207     simply into the sequential matrix identical with the corresponding A (if size == 1).
3208     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3209     to have the same sparsity pattern.
3210     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3211     must be constructed for C. This is done by setseqmat(s).
3212   */
3213   for (i = 0, ii = 0; i < ismax; ++i) {
3214     /*
3215        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3216        That way we can avoid sorting and computing permutations when reusing.
3217        To this end:
3218         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3219         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3220           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3221     */
3222     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3223 
3224     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
3225     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3226     if (size > 1) {
3227       if (scall == MAT_INITIAL_MATRIX) {
3228         ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
3229         ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
3230         ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
3231         ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
3232         ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
3233       }
3234       /*
3235         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3236       */
3237       {
3238         Mat AA = A[i],BB = B[ii];
3239 
3240         if (AA || BB) {
3241           ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
3242           ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3243           ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3244         }
3245         ierr = MatDestroy(&AA);CHKERRQ(ierr);
3246       }
3247       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
3248       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
3249       ++ii;
3250     } else { /* if (size == 1) */
3251       if (scall == MAT_REUSE_MATRIX) {
3252         ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
3253       }
3254       if (isrow_p[i] || iscol_p[i]) {
3255         ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
3256         ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
3257         /* Otherwise A is extracted straight into (*submat)[i]. */
3258         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && scall == MAT_INITIAL_MATRIX). */
3259         ierr = MatDestroy(A+i);CHKERRQ(ierr);
3260       } else (*submat)[i] = A[i];
3261     }
3262     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
3263     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
3264   }
3265   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
3266   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
3267   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
3268   ierr = PetscFree(A);CHKERRQ(ierr);
3269   ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
3270   PetscFunctionReturn(0);
3271 }
3272 
3273 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3274 {
3275   PetscErrorCode ierr;
3276 
3277   PetscFunctionBegin;
3278   ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
3279   PetscFunctionReturn(0);
3280 }
3281
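     /*
      Illustrative usage sketch (hypothetical caller, not part of this file): extract a single parallel
      submatrix of an assembled MPIAIJ matrix C using row/column index sets that live on C's communicator;
      nloc and first stand for the locally owned piece of the submatrix and ierr for the usual error code.

        IS  isrow,iscol;
        Mat *submats;

        ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&isrow);CHKERRQ(ierr);
        ierr = ISDuplicate(isrow,&iscol);CHKERRQ(ierr);
        ierr = MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
        ... use submats[0] ...
        ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
        ierr = ISDestroy(&iscol);CHKERRQ(ierr);
        ierr = ISDestroy(&isrow);CHKERRQ(ierr);
     */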