xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254) !
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 #include <petscsf.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
20 
21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22 {
23   PetscInt       i;
24 
25   PetscFunctionBegin;
26   PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27   for (i=0; i<ov; ++i) {
28     CHKERRQ(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
29   }
30   PetscFunctionReturn(0);
31 }
32 
33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
34 {
35   PetscInt       i;
36 
37   PetscFunctionBegin;
38   PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
39   for (i=0; i<ov; ++i) {
40     CHKERRQ(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
41   }
42   PetscFunctionReturn(0);
43 }
44 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - Expand each of the nidx index sets in is[]
  by one layer of overlap, using PetscSF/two-sided communication rather than
  all-to-all buffers (hence "scalable").

  Outline:
    1. Gather all IS indices; classify each as local or owned by a remote rank.
    2. If no rank has remote rows (typical for the first overlap layer), only
       local work is needed - early return via .._Local_Scalable.
    3. Otherwise send <is id, row> pairs to the owning ranks, have them compute
       the overlap rows (.._Send_Scalable), receive the results back, and merge
       them into is[] (.._Receive_Scalable).

  The is[] array is modified in place (old IS objects destroyed, new ones created).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm));
  CHKERRMPI(MPI_Comm_rank(comm,&rank));
  CHKERRMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  CHKERRQ(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   *  */
  CHKERRQ(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    CHKERRQ(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    CHKERRQ(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors */
  CHKERRQ(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  CHKERRQ(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      CHKERRQ(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]  = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    CHKERRQ(ISRestoreIndices(is[i],&indices[i]));
  }
  CHKERRQ(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  CHKERRMPI(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    CHKERRQ(PetscFree3(toranks,tosizes,tosizes_temp));
    CHKERRQ(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size (two entries - <is id, row> - per remote row) */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  CHKERRQ(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  CHKERRQ(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  CHKERRQ(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  CHKERRQ(PetscSFCreate(comm,&sf));
  CHKERRQ(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  CHKERRQ(PetscSFSetType(sf,PETSCSFBASIC));
  CHKERRQ(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row>  */
  CHKERRQ(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  CHKERRQ(PetscFree3(toranks,tosizes,tosizes_temp));
  CHKERRQ(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  CHKERRQ(PetscFree(toffsets));
  CHKERRQ(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  CHKERRQ(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  CHKERRQ(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  CHKERRQ(PetscFree2(todata,fromdata));
  CHKERRQ(PetscFree(fromsizes));
  /* second two-sided exchange: ship the computed overlap rows back to the requesters */
  CHKERRQ(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  CHKERRQ(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  CHKERRQ(PetscCalloc1(nrecvrows,&todata));
  CHKERRQ(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  CHKERRQ(PetscSFCreate(comm,&sf));
  CHKERRQ(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  CHKERRQ(PetscSFSetType(sf,PETSCSFBASIC));
  CHKERRQ(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  CHKERRQ(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  CHKERRQ(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  CHKERRQ(PetscSFDestroy(&sf));
  CHKERRQ(PetscFree2(sbdata,sbsizes));
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  CHKERRQ(PetscFree(toranks));
  CHKERRQ(PetscFree(tosizes));
  CHKERRQ(PetscFree(todata));
  PetscFunctionReturn(0);
}
187 
/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap rows received from other
  processes into the local index sets is[] (in place: the old IS objects are destroyed
  and new, enlarged, duplicate-free ones are created on the same communicators).

  recvdata is a packed stream of triples <is id, data size, rows...> as produced by
  MatIncreaseOverlap_MPIAIJ_Send_Scalable on the sending side; nrecvs is its length.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt         *isz,isz_i,i,j,is_id, data_size;
  PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt   *indices_i_temp;
  MPI_Comm         *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  CHKERRQ(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    CHKERRQ(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slot of (max_lsize+nrecvs) entries per IS: enough for its current indices
     plus, in the worst case, every received row landing in the same IS */
  CHKERRQ(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    CHKERRQ(ISGetIndices(is[i],&indices_i_temp));
    CHKERRQ(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    CHKERRQ(ISRestoreIndices(is[i],&indices_i_temp));
    CHKERRQ(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    CHKERRQ(PetscSortRemoveDupsInt(&isz_i,indices_i));
    CHKERRQ(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    CHKERRQ(PetscCommDestroy(&iscomms[i]));
  }
  CHKERRQ(PetscFree(isz));
  CHKERRQ(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}
235 
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each row requested by a remote process,
  compute the set of columns (overlap) from the local diagonal (Amat) and off-diagonal
  (Bmat) blocks, and pack the results for sending back.

  fromrows contains nfrom groups of <is id, row> pairs (group i has fromsizes[2*i]
  entries).  The output *sbrows is a stream of triples <is id, count, cols...> per
  requesting rank, and *sbsizes holds per-rank <size, offset> pairs.  Both output
  arrays are allocated here; ownership passes to the caller.

  The routine makes two passes over the request data: pass 1 measures the exact
  output size (indv_counts/totalrows), pass 2 fills the buffers.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout       rmap,cmap;
  PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt   *gcols,*ai,*aj,*bi,*bj;
  Mat               amat,bmat;
  PetscMPIInt       rank;
  PetscBool         done;
  MPI_Comm          comm;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm));
  CHKERRMPI(MPI_Comm_rank(comm,&rank));
  CHKERRQ(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  CHKERRQ(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  CHKERRQ(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  CHKERRQ(MatGetLayouts(mat,&rmap,&cmap));
  CHKERRQ(PetscLayoutGetRange(rmap,&rstart,NULL));
  CHKERRQ(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  CHKERRQ(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* pass 1: count output entries per <rank, is> pair */
  for (i=0; i<nfrom; i++) {
    CHKERRQ(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row    = rows_i[l]-rstart; /* convert to local row number */
        start  = ai[row];
        end    = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]]; /* map off-diagonal local column to global via garray */
          indices_tmp[indvc_ij++] = col;
        }
      }
      CHKERRQ(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  CHKERRQ(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* pass 2: same traversal again, this time filling the send buffer */
  for (i=0;i<nfrom;i++) {
    CHKERRQ(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data  */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      CHKERRQ(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i]  += indvc_ij;
      CHKERRQ(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* record per-rank offsets alongside sizes (consumed by PetscCommBuildTwoSided) */
  CHKERRQ(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  CHKERRQ(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  CHKERRQ(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  CHKERRQ(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  CHKERRQ(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
357 
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set is[] by the columns
  of the locally owned rows it contains (rows owned by other ranks are skipped; they
  are handled by the send/receive pair).  Each IS is rebuilt in place, sorted and
  duplicate-free, on a duplicate of its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt          lsize,lsize_tmp;
  PetscMPIInt       rank,owner;
  Mat               amat,bmat;
  PetscBool         done;
  PetscLayout       cmap,rmap;
  MPI_Comm          comm;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm));
  CHKERRMPI(MPI_Comm_rank(comm,&rank));
  CHKERRQ(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  CHKERRQ(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  CHKERRQ(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  /* tnz bounds the number of columns any one IS can collect, since each IS only
     visits locally owned rows and each nonzero contributes at most one column */
  tnz  = ai[an]+bi[bn];
  CHKERRQ(MatGetLayouts(mat,&rmap,&cmap));
  CHKERRQ(PetscLayoutGetRange(rmap,&rstart,NULL));
  CHKERRQ(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  CHKERRQ(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    CHKERRQ(ISGetLocalSize(is[i],&lsize));
    CHKERRQ(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      CHKERRQ(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue; /* only locally owned rows contribute here */
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local column + cstart gives global column */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: garray maps local column to global */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
   CHKERRQ(ISRestoreIndices(is[i],&indices));
   CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
   CHKERRQ(ISDestroy(&is[i]));
   CHKERRQ(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
   CHKERRQ(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
   CHKERRQ(PetscCommDestroy(&iscomm));
  }
  CHKERRQ(PetscFree(indices_temp));
  CHKERRQ(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  CHKERRQ(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}
424 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5], the message is laid out as

  mesg[0] = 2                  (number of index sets in the message)
  -----------
  mesg[1] = 1                  => is[1]
  mesg[2] = sizeof(is[1])      (number of indices that follow for is[1])
  -----------
  mesg[3] = 5                  => is[5]
  mesg[4] = sizeof(is[5])      (number of indices that follow for is[5])
  -----------
  mesg[5] .. mesg[n]           the indices of is[1]
  -----------
  mesg[n+1] .. mesg[m]         the indices of is[5]
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or have been, processed)
*/
/*
  MatIncreaseOverlap_MPIAIJ_Once - Expand each of the imax index sets in is[] by one
  layer of overlap, using the original (non-scalable) algorithm with per-rank
  MPI_Isend/Irecv buffers sized by the global number of ranks.

  Protocol (see the message-format comment above):
    phase 1 - each rank sends the rows it does not own to their owners (tag1),
              while doing the local expansion itself (MatIncreaseOverlap_MPIAIJ_Local);
    phase 2 - owners compute the overlap of the received rows
              (MatIncreaseOverlap_MPIAIJ_Receive) and send the results back (tag2);
    finally the received rows are merged, via per-IS bit tables, into new IS objects.

  The is[] array is modified in place.  With PETSC_USE_CTABLE the per-IS result
  lists live in hash tables (table_data); otherwise in dense arrays (data/d_p).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag1));
  CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2));

  CHKERRQ(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    CHKERRQ(ISGetIndices(is[i],&idx[i]));
    CHKERRQ(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them  */
  /* w1[p] = total rows destined for rank p; w3[p] = number of ISs contributing to p */
  CHKERRQ(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  for (i=0; i<imax; i++) {
    CHKERRQ(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      CHKERRQ(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  CHKERRQ(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  CHKERRQ(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  CHKERRQ(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  CHKERRQ(PetscArrayzero(outdat,size));
  CHKERRQ(PetscArrayzero(ptr,size));

  {
    /* carve the single tmp buffer into one contiguous slice per destination rank */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    CHKERRQ(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    CHKERRQ(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    CHKERRQ(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      CHKERRQ(PetscTableCreate(n[i],M,&table_data[i]));
    }
    CHKERRQ(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; /* one M-bit table per IS */
    }
#else
    PetscInt Mimax = 0;
    CHKERRQ(PetscIntMultError(M,imax, &Mimax));
    CHKERRQ(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    CHKERRQ(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      CHKERRQ(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        CHKERRQ(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          CHKERRQ(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  CHKERRQ(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    CHKERRMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices */
  CHKERRQ(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    CHKERRQ(ISRestoreIndices(is[i],idx+i));
    CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  CHKERRQ(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    CHKERRQ(ISDestroy(&is[i]));
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  CHKERRQ(PetscMalloc1(nrqr,&recv_status));
  CHKERRMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  CHKERRMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  CHKERRQ(PetscFree4(outdat,ptr,tmp,ctr));
  CHKERRQ(PetscFree4(w1,w2,w3,w4));

  /* compute the overlap for the rows other ranks asked about; answers in xdata/isz1 */
  CHKERRQ(PetscMalloc1(nrqr,&xdata));
  CHKERRQ(PetscMalloc1(nrqr,&isz1));
  CHKERRQ(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  CHKERRQ(PetscFree(rbuf[0]));
  CHKERRQ(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    CHKERRQ(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheckFalse(proc != onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    CHKERRQ(PetscFree(onodes1));
    CHKERRQ(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    CHKERRQ(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    CHKERRQ(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  CHKERRQ(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now  post the sends */
  CHKERRQ(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    CHKERRMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      CHKERRMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* skip the header: count + <is id, size> pairs */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* only add rows not seen before */
#if defined(PETSC_USE_CTABLE)
            CHKERRQ(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    CHKERRMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* one scratch array large enough for the biggest table, reused for every IS */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    CHKERRQ(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  CHKERRQ(PetscMalloc1(tcount_max+1,&tdata));
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    CHKERRQ(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      /* table stores <row+1 -> position+1>; undo both +1 shifts */
      CHKERRQ(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k;
    }
    CHKERRQ(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    CHKERRQ(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    CHKERRQ(PetscCommDestroy(&iscomms[i]));
  }

  CHKERRQ(PetscFree(iscomms));
  CHKERRQ(PetscFree(onodes2));
  CHKERRQ(PetscFree(olengths2));

  CHKERRQ(PetscFree(pa));
  CHKERRQ(PetscFree(rbuf2[0]));
  CHKERRQ(PetscFree(rbuf2));
  CHKERRQ(PetscFree(s_waits1));
  CHKERRQ(PetscFree(r_waits1));
  CHKERRQ(PetscFree(s_waits2));
  CHKERRQ(PetscFree(r_waits2));
  CHKERRQ(PetscFree(recv_status));
  if (xdata) {
    CHKERRQ(PetscFree(xdata[0]));
    CHKERRQ(PetscFree(xdata));
  }
  CHKERRQ(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    CHKERRQ(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  CHKERRQ(PetscFree(table_data));
  CHKERRQ(PetscFree(tdata));
  CHKERRQ(PetscFree4(table,data,isz,t_p));
#else
  CHKERRQ(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}
792 
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total number of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;  /* A: diagonal block, B: off-diagonal block of the parallel matrix */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;  /* first global row owned by this process */
  cstart = C->cmap->rstart;  /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;        /* maps compressed local column index of B to global column */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table stores
       (index+1 -> position+1), so key and value are shifted back down by one */
    table_data_i = table_data[i];
    CHKERRQ(PetscTableGetCount(table_data_i,&tcount));
    PetscCheckFalse(tcount != isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    CHKERRQ(PetscMalloc1(tcount,&tdata));
    CHKERRQ(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      CHKERRQ(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      tdata[--j] = --row;
      PetscCheckFalse(j > tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];  /* bit table marking which global indices already belong to set i */
    isz_i   = isz[i];
    max     = isz[i];    /* snapshot the current count: only pre-existing rows are expanded
                            this pass; entries appended below are not re-scanned */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* translate diagonal-block local column to global */
        if (!PetscBTLookupSet(table_i,val)) {  /* append only if not already present */
#if defined(PETSC_USE_CTABLE)
          CHKERRQ(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* off-diagonal columns are stored compressed; map to global */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          CHKERRQ(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;  /* updated element count of index set i */

#if defined(PETSC_USE_CTABLE)
    CHKERRQ(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}
894 
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows, as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;  /* diagonal and off-diagonal blocks */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;   /* scratch bit table used to deduplicate indices within one IS */

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct: total number of index sets over all messages; total_sz: total number of requested rows */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];          /* message header: number of index sets it carries */
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply-buffer size; it is grown by 1.5x below whenever it proves too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    CHKERRQ(PetscMalloc1(mem_estimate,&xdata[0]));  /* single chunk; xdata[i>0] point into it */
    ++no_malloc;
  }
  CHKERRQ(PetscBTCreate(m,&xtable));
  CHKERRQ(PetscArrayzero(isz1,nrqr));

  ct3 = 0;  /* running total of entries written into the chunk (overflow detector) */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor in rbuf_i, positioned just past its header */
    ct2    =  ct1;         /* write cursor in xdata[i]; same-size header space is reserved */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      CHKERRQ(PetscBTMemzero(m,xtable));
      oct2 = ct2;          /* remember where this IS's payload begins in xdata[i] */
      kmax = rbuf_i[2*j];  /* number of rows requested for this IS */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* chunk full: grow 1.5x, copy, and re-base xdata[1..i] into the new chunk */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            CHKERRQ(PetscMalloc1(new_estimate,&tmp));
            CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate));
            CHKERRQ(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each stored row of this IS by its column indices, A block then B block */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {  /* diagonal-block columns, shifted to global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              CHKERRQ(PetscMalloc1(new_estimate,&tmp));
              CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate));
              CHKERRQ(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {  /* off-diagonal-block columns, mapped to global via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              CHKERRQ(PetscMalloc1(new_estimate,&tmp));
              CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate));
              CHKERRQ(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;  /* next reply follows contiguously in the chunk */
    isz1[i]     = ct2; /* size of each message */
  }
  CHKERRQ(PetscBTDestroy(&xtable));
  CHKERRQ(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
  PetscFunctionReturn(0);
}
1030 /* -------------------------------------------------------------------------*/
1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1032 /*
1033     Every processor gets the entire matrix
1034 */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;  /* ad/bd: diagonal and off-diagonal blocks */
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  CHKERRMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    CHKERRQ(PetscMalloc1(A->rmap->N,&lens));
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = nnz in the diagonal block + nnz in the off-diagonal block */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    CHKERRQ(PetscMalloc2(size,&recvcounts,size,&displs));
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgather: each rank contributed lens[] for its own row range above */
    CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    CHKERRQ(MatCreate(PETSC_COMM_SELF,&B));
    CHKERRQ(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE));
    CHKERRQ(MatSetBlockSizesFromMats(B,A,A));
    CHKERRQ(MatSetType(B,((PetscObject)a->A)->type_name));
    CHKERRQ(MatSeqAIJSetPreallocation(B,0,lens));
    CHKERRQ(PetscCalloc1(2,Bin));  /* length-2, zeroed: slot 0 holds B, slot 1 stays NULL -- presumably a terminator for array-destroy callers; confirm */
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write straight into B's j array at this rank's rows */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge, per local row: B-block columns left of the diagonal block, then the
       diagonal block, then remaining B-block columns -- yields globally sorted columns */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      /* recvcounts[i] = total nnz owned by rank i */
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    CHKERRQ(PetscArraycpy(b->ilen,lens,A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    CHKERRQ(PetscFree(lens));
    CHKERRQ(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    CHKERRQ(MatSeqAIJGetArrayRead(a->A,&ada));
    CHKERRQ(MatSeqAIJGetArrayRead(a->B,&bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write values directly into B's a array */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values land next to their matching column indices */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: recvcounts/displs were not allocated in this call */
      CHKERRQ(PetscMalloc2(size,&recvcounts,size,&displs));
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A)));
    CHKERRQ(MatSeqAIJRestoreArrayRead(a->A,&ada));
    CHKERRQ(MatSeqAIJRestoreArrayRead(a->B,&bda));
  }  /* endof (flag == MAT_GET_VALUES) */
  CHKERRQ(PetscFree2(recvcounts,displs));

  /* propagate symmetry flags from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    CHKERRQ(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
  } else if (A->hermitian) {
    CHKERRQ(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE));
  } else if (A->structurally_symmetric) {
    CHKERRQ(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE));
  }
  PetscFunctionReturn(0);
}
1206 
1207 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1208 {
1209   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1210   Mat            submat,A = c->A,B = c->B;
1211   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1212   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1213   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1214   const PetscInt *icol,*irow;
1215   PetscInt       nrow,ncol,start;
1216   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1217   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1218   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1219   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1220   PetscInt       *lens,rmax,ncols,*cols,Crow;
1221 #if defined(PETSC_USE_CTABLE)
1222   PetscTable     cmap,rmap;
1223   PetscInt       *cmap_loc,*rmap_loc;
1224 #else
1225   PetscInt       *cmap,*rmap;
1226 #endif
1227   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1228   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1229   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1230   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1231   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1232   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1233   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1234   MPI_Comm       comm;
1235   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1236   PetscMPIInt    *onodes1,*olengths1,idex,end;
1237   Mat_SubSppt    *smatis1;
1238   PetscBool      isrowsorted,iscolsorted;
1239 
1240   PetscFunctionBegin;
1241   PetscValidLogicalCollectiveInt(C,ismax,2);
1242   PetscValidLogicalCollectiveEnum(C,scall,5);
1243   PetscCheckFalse(ismax != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1244   CHKERRQ(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
1245   CHKERRQ(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a));
1246   CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm));
1247   size = c->size;
1248   rank = c->rank;
1249 
1250   CHKERRQ(ISSorted(iscol[0],&iscolsorted));
1251   CHKERRQ(ISSorted(isrow[0],&isrowsorted));
1252   CHKERRQ(ISGetIndices(isrow[0],&irow));
1253   CHKERRQ(ISGetLocalSize(isrow[0],&nrow));
1254   if (allcolumns) {
1255     icol = NULL;
1256     ncol = C->cmap->N;
1257   } else {
1258     CHKERRQ(ISGetIndices(iscol[0],&icol));
1259     CHKERRQ(ISGetLocalSize(iscol[0],&ncol));
1260   }
1261 
1262   if (scall == MAT_INITIAL_MATRIX) {
1263     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1264 
1265     /* Get some new tags to keep the communication clean */
1266     tag1 = ((PetscObject)C)->tag;
1267     CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2));
1268     CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag3));
1269 
1270     /* evaluate communication - mesg to who, length of mesg, and buffer space
1271      required. Based on this, buffers are allocated, and data copied into them */
1272     CHKERRQ(PetscCalloc2(size,&w1,size,&w2));
1273     CHKERRQ(PetscMalloc1(nrow,&row2proc));
1274 
1275     /* w1[proc] = num of rows owned by proc -- to be requested */
1276     proc = 0;
1277     nrqs = 0; /* num of outgoing messages */
1278     for (j=0; j<nrow; j++) {
1279       row  = irow[j];
1280       if (!isrowsorted) proc = 0;
1281       while (row >= C->rmap->range[proc+1]) proc++;
1282       w1[proc]++;
1283       row2proc[j] = proc; /* map row index to proc */
1284 
1285       if (proc != rank && !w2[proc]) {
1286         w2[proc] = 1; nrqs++;
1287       }
1288     }
1289     w1[rank] = 0;  /* rows owned by self will not be requested */
1290 
1291     CHKERRQ(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
1292     for (proc=0,j=0; proc<size; proc++) {
1293       if (w1[proc]) { pa[j++] = proc;}
1294     }
1295 
1296     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1297     msz = 0;              /* total mesg length (for all procs) */
1298     for (i=0; i<nrqs; i++) {
1299       proc      = pa[i];
1300       w1[proc] += 3;
1301       msz      += w1[proc];
1302     }
1303     CHKERRQ(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
1304 
1305     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1306     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1307     CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
1308 
1309     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1310        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1311     CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
1312 
1313     /* Now post the Irecvs corresponding to these messages */
1314     CHKERRQ(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
1315 
1316     CHKERRQ(PetscFree(onodes1));
1317     CHKERRQ(PetscFree(olengths1));
1318 
1319     /* Allocate Memory for outgoing messages */
1320     CHKERRQ(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
1321     CHKERRQ(PetscArrayzero(sbuf1,size));
1322     CHKERRQ(PetscArrayzero(ptr,size));
1323 
1324     /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1325     iptr = tmp;
1326     for (i=0; i<nrqs; i++) {
1327       proc        = pa[i];
1328       sbuf1[proc] = iptr;
1329       iptr       += w1[proc];
1330     }
1331 
1332     /* Form the outgoing messages */
1333     /* Initialize the header space */
1334     for (i=0; i<nrqs; i++) {
1335       proc      = pa[i];
1336       CHKERRQ(PetscArrayzero(sbuf1[proc],3));
1337       ptr[proc] = sbuf1[proc] + 3;
1338     }
1339 
1340     /* Parse the isrow and copy data into outbuf */
1341     CHKERRQ(PetscArrayzero(ctr,size));
1342     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1343       proc = row2proc[j];
1344       if (proc != rank) { /* copy to the outgoing buf*/
1345         *ptr[proc] = irow[j];
1346         ctr[proc]++; ptr[proc]++;
1347       }
1348     }
1349 
1350     /* Update the headers for the current IS */
1351     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1352       if ((ctr_j = ctr[j])) {
1353         sbuf1_j        = sbuf1[j];
1354         k              = ++sbuf1_j[0];
1355         sbuf1_j[2*k]   = ctr_j;
1356         sbuf1_j[2*k-1] = 0;
1357       }
1358     }
1359 
1360     /* Now post the sends */
1361     CHKERRQ(PetscMalloc1(nrqs,&s_waits1));
1362     for (i=0; i<nrqs; ++i) {
1363       proc = pa[i];
1364       CHKERRMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i));
1365     }
1366 
1367     /* Post Receives to capture the buffer size */
1368     CHKERRQ(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2));
1369     CHKERRQ(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
1370 
1371     if (nrqs) rbuf2[0] = tmp + msz;
1372     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1373 
1374     for (i=0; i<nrqs; ++i) {
1375       proc = pa[i];
1376       CHKERRMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i));
1377     }
1378 
1379     CHKERRQ(PetscFree2(w1,w2));
1380 
1381     /* Send to other procs the buf size they should allocate */
1382     /* Receive messages*/
1383     CHKERRQ(PetscMalloc1(nrqr,&r_status1));
1384     CHKERRQ(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
1385 
1386     CHKERRMPI(MPI_Waitall(nrqr,r_waits1,r_status1));
1387     for (i=0; i<nrqr; ++i) {
1388       req_size[i] = 0;
1389       rbuf1_i        = rbuf1[i];
1390       start          = 2*rbuf1_i[0] + 1;
1391       CHKERRMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
1392       CHKERRQ(PetscMalloc1(end,&sbuf2[i]));
1393       sbuf2_i        = sbuf2[i];
1394       for (j=start; j<end; j++) {
1395         k            = rbuf1_i[j] - rstart;
1396         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1397         sbuf2_i[j]   = ncols;
1398         req_size[i] += ncols;
1399       }
1400       req_source1[i] = r_status1[i].MPI_SOURCE;
1401 
1402       /* form the header */
1403       sbuf2_i[0] = req_size[i];
1404       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1405 
1406       CHKERRMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
1407     }
1408 
1409     CHKERRQ(PetscFree(r_status1));
1410     CHKERRQ(PetscFree(r_waits1));
1411 
1412     /* rbuf2 is received, Post recv column indices a->j */
1413     CHKERRMPI(MPI_Waitall(nrqs,r_waits2,r_status2));
1414 
1415     CHKERRQ(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3));
1416     for (i=0; i<nrqs; ++i) {
1417       CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
1418       req_source2[i] = r_status2[i].MPI_SOURCE;
1419       CHKERRMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
1420     }
1421 
1422     /* Wait on sends1 and sends2 */
1423     CHKERRQ(PetscMalloc1(nrqs,&s_status1));
1424     CHKERRMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
1425     CHKERRQ(PetscFree(s_waits1));
1426     CHKERRQ(PetscFree(s_status1));
1427 
1428     CHKERRMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
1429     CHKERRQ(PetscFree4(r_status2,s_waits2,r_waits2,s_status2));
1430 
1431     /* Now allocate sending buffers for a->j, and send them off */
1432     CHKERRQ(PetscMalloc1(nrqr,&sbuf_aj));
1433     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1434     if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aj[0]));
1435     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1436 
1437     for (i=0; i<nrqr; i++) { /* for each requested message */
1438       rbuf1_i   = rbuf1[i];
1439       sbuf_aj_i = sbuf_aj[i];
1440       ct1       = 2*rbuf1_i[0] + 1;
1441       ct2       = 0;
1442       /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1443 
1444       kmax = rbuf1[i][2];
1445       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1446         row    = rbuf1_i[ct1] - rstart;
1447         nzA    = ai[row+1] - ai[row];
1448         nzB    = bi[row+1] - bi[row];
1449         ncols  = nzA + nzB;
1450         cworkA = aj + ai[row]; cworkB = bj + bi[row];
1451 
1452         /* load the column indices for this row into cols*/
1453         cols = sbuf_aj_i + ct2;
1454 
1455         lwrite = 0;
1456         for (l=0; l<nzB; l++) {
1457           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1458         }
1459         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1460         for (l=0; l<nzB; l++) {
1461           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1462         }
1463 
1464         ct2 += ncols;
1465       }
1466       CHKERRMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
1467     }
1468 
1469     /* create column map (cmap): global col of C -> local col of submat */
1470 #if defined(PETSC_USE_CTABLE)
1471     if (!allcolumns) {
1472       CHKERRQ(PetscTableCreate(ncol,C->cmap->N,&cmap));
1473       CHKERRQ(PetscCalloc1(C->cmap->n,&cmap_loc));
1474       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1475         if (icol[j] >= cstart && icol[j] <cend) {
1476           cmap_loc[icol[j] - cstart] = j+1;
1477         } else { /* use PetscTable for non-local col indices */
1478           CHKERRQ(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES));
1479         }
1480       }
1481     } else {
1482       cmap     = NULL;
1483       cmap_loc = NULL;
1484     }
1485     CHKERRQ(PetscCalloc1(C->rmap->n,&rmap_loc));
1486 #else
1487     if (!allcolumns) {
1488       CHKERRQ(PetscCalloc1(C->cmap->N,&cmap));
1489       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1490     } else {
1491       cmap = NULL;
1492     }
1493 #endif
1494 
1495     /* Create lens for MatSeqAIJSetPreallocation() */
1496     CHKERRQ(PetscCalloc1(nrow,&lens));
1497 
1498     /* Compute lens from local part of C */
1499     for (j=0; j<nrow; j++) {
1500       row  = irow[j];
1501       proc = row2proc[j];
1502       if (proc == rank) {
1503         /* diagonal part A = c->A */
1504         ncols = ai[row-rstart+1] - ai[row-rstart];
1505         cols  = aj + ai[row-rstart];
1506         if (!allcolumns) {
1507           for (k=0; k<ncols; k++) {
1508 #if defined(PETSC_USE_CTABLE)
1509             tcol = cmap_loc[cols[k]];
1510 #else
1511             tcol = cmap[cols[k]+cstart];
1512 #endif
1513             if (tcol) lens[j]++;
1514           }
1515         } else { /* allcolumns */
1516           lens[j] = ncols;
1517         }
1518 
1519         /* off-diagonal part B = c->B */
1520         ncols = bi[row-rstart+1] - bi[row-rstart];
1521         cols  = bj + bi[row-rstart];
1522         if (!allcolumns) {
1523           for (k=0; k<ncols; k++) {
1524 #if defined(PETSC_USE_CTABLE)
1525             CHKERRQ(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1526 #else
1527             tcol = cmap[bmap[cols[k]]];
1528 #endif
1529             if (tcol) lens[j]++;
1530           }
1531         } else { /* allcolumns */
1532           lens[j] += ncols;
1533         }
1534       }
1535     }
1536 
1537     /* Create row map (rmap): global row of C -> local row of submat */
1538 #if defined(PETSC_USE_CTABLE)
1539     CHKERRQ(PetscTableCreate(nrow,C->rmap->N,&rmap));
1540     for (j=0; j<nrow; j++) {
1541       row  = irow[j];
1542       proc = row2proc[j];
1543       if (proc == rank) { /* a local row */
1544         rmap_loc[row - rstart] = j;
1545       } else {
1546         CHKERRQ(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES));
1547       }
1548     }
1549 #else
1550     CHKERRQ(PetscCalloc1(C->rmap->N,&rmap));
1551     for (j=0; j<nrow; j++) {
1552       rmap[irow[j]] = j;
1553     }
1554 #endif
1555 
1556     /* Update lens from offproc data */
1557     /* recv a->j is done */
1558     CHKERRMPI(MPI_Waitall(nrqs,r_waits3,r_status3));
1559     for (i=0; i<nrqs; i++) {
1560       proc    = pa[i];
1561       sbuf1_i = sbuf1[proc];
1562       /* jmax    = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1563       ct1     = 2 + 1;
1564       ct2     = 0;
1565       rbuf2_i = rbuf2[i]; /* received length of C->j */
1566       rbuf3_i = rbuf3[i]; /* received C->j */
1567 
1568       /* is_no  = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1569       max1   = sbuf1_i[2];
1570       for (k=0; k<max1; k++,ct1++) {
1571 #if defined(PETSC_USE_CTABLE)
1572         CHKERRQ(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row));
1573         row--;
1574         PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1575 #else
1576         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1577 #endif
1578         /* Now, store row index of submat in sbuf1_i[ct1] */
1579         sbuf1_i[ct1] = row;
1580 
1581         nnz = rbuf2_i[ct1];
1582         if (!allcolumns) {
1583           for (l=0; l<nnz; l++,ct2++) {
1584 #if defined(PETSC_USE_CTABLE)
1585             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1586               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1587             } else {
1588               CHKERRQ(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1589             }
1590 #else
1591             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1592 #endif
1593             if (tcol) lens[row]++;
1594           }
1595         } else { /* allcolumns */
1596           lens[row] += nnz;
1597         }
1598       }
1599     }
1600     CHKERRMPI(MPI_Waitall(nrqr,s_waits3,s_status3));
1601     CHKERRQ(PetscFree4(r_waits3,s_waits3,r_status3,s_status3));
1602 
1603     /* Create the submatrices */
1604     CHKERRQ(MatCreate(PETSC_COMM_SELF,&submat));
1605     CHKERRQ(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE));
1606 
1607     CHKERRQ(ISGetBlockSize(isrow[0],&i));
1608     CHKERRQ(ISGetBlockSize(iscol[0],&j));
1609     CHKERRQ(MatSetBlockSizes(submat,i,j));
1610     CHKERRQ(MatSetType(submat,((PetscObject)A)->type_name));
1611     CHKERRQ(MatSeqAIJSetPreallocation(submat,0,lens));
1612 
1613     /* create struct Mat_SubSppt and attached it to submat */
1614     CHKERRQ(PetscNew(&smatis1));
1615     subc = (Mat_SeqAIJ*)submat->data;
1616     subc->submatis1 = smatis1;
1617 
1618     smatis1->id          = 0;
1619     smatis1->nrqs        = nrqs;
1620     smatis1->nrqr        = nrqr;
1621     smatis1->rbuf1       = rbuf1;
1622     smatis1->rbuf2       = rbuf2;
1623     smatis1->rbuf3       = rbuf3;
1624     smatis1->sbuf2       = sbuf2;
1625     smatis1->req_source2 = req_source2;
1626 
1627     smatis1->sbuf1       = sbuf1;
1628     smatis1->ptr         = ptr;
1629     smatis1->tmp         = tmp;
1630     smatis1->ctr         = ctr;
1631 
1632     smatis1->pa           = pa;
1633     smatis1->req_size     = req_size;
1634     smatis1->req_source1  = req_source1;
1635 
1636     smatis1->allcolumns  = allcolumns;
1637     smatis1->singleis    = PETSC_TRUE;
1638     smatis1->row2proc    = row2proc;
1639     smatis1->rmap        = rmap;
1640     smatis1->cmap        = cmap;
1641 #if defined(PETSC_USE_CTABLE)
1642     smatis1->rmap_loc    = rmap_loc;
1643     smatis1->cmap_loc    = cmap_loc;
1644 #endif
1645 
1646     smatis1->destroy     = submat->ops->destroy;
1647     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1648     submat->factortype   = C->factortype;
1649 
1650     /* compute rmax */
1651     rmax = 0;
1652     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1653 
1654   } else { /* scall == MAT_REUSE_MATRIX */
1655     submat = submats[0];
1656     PetscCheckFalse(submat->rmap->n != nrow || submat->cmap->n != ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1657 
1658     subc    = (Mat_SeqAIJ*)submat->data;
1659     rmax    = subc->rmax;
1660     smatis1 = subc->submatis1;
1661     nrqs        = smatis1->nrqs;
1662     nrqr        = smatis1->nrqr;
1663     rbuf1       = smatis1->rbuf1;
1664     rbuf2       = smatis1->rbuf2;
1665     rbuf3       = smatis1->rbuf3;
1666     req_source2 = smatis1->req_source2;
1667 
1668     sbuf1     = smatis1->sbuf1;
1669     sbuf2     = smatis1->sbuf2;
1670     ptr       = smatis1->ptr;
1671     tmp       = smatis1->tmp;
1672     ctr       = smatis1->ctr;
1673 
1674     pa         = smatis1->pa;
1675     req_size   = smatis1->req_size;
1676     req_source1 = smatis1->req_source1;
1677 
1678     allcolumns = smatis1->allcolumns;
1679     row2proc   = smatis1->row2proc;
1680     rmap       = smatis1->rmap;
1681     cmap       = smatis1->cmap;
1682 #if defined(PETSC_USE_CTABLE)
1683     rmap_loc   = smatis1->rmap_loc;
1684     cmap_loc   = smatis1->cmap_loc;
1685 #endif
1686   }
1687 
1688   /* Post recv matrix values */
1689   CHKERRQ(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals));
1690   CHKERRQ(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4));
1691   CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag4));
1692   for (i=0; i<nrqs; ++i) {
1693     CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
1694     CHKERRMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
1695   }
1696 
1697   /* Allocate sending buffers for a->a, and send them off */
1698   CHKERRQ(PetscMalloc1(nrqr,&sbuf_aa));
1699   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1700   if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aa[0]));
1701   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1702 
1703   for (i=0; i<nrqr; i++) {
1704     rbuf1_i   = rbuf1[i];
1705     sbuf_aa_i = sbuf_aa[i];
1706     ct1       = 2*rbuf1_i[0]+1;
1707     ct2       = 0;
1708     /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1709 
1710     kmax = rbuf1_i[2];
1711     for (k=0; k<kmax; k++,ct1++) {
1712       row = rbuf1_i[ct1] - rstart;
1713       nzA = ai[row+1] - ai[row];
1714       nzB = bi[row+1] - bi[row];
1715       ncols  = nzA + nzB;
1716       cworkB = bj + bi[row];
1717       vworkA = a_a + ai[row];
1718       vworkB = b_a + bi[row];
1719 
1720       /* load the column values for this row into vals*/
1721       vals = sbuf_aa_i + ct2;
1722 
1723       lwrite = 0;
1724       for (l=0; l<nzB; l++) {
1725         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1726       }
1727       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1728       for (l=0; l<nzB; l++) {
1729         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1730       }
1731 
1732       ct2 += ncols;
1733     }
1734     CHKERRMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
1735   }
1736 
1737   /* Assemble submat */
1738   /* First assemble the local rows */
1739   for (j=0; j<nrow; j++) {
1740     row  = irow[j];
1741     proc = row2proc[j];
1742     if (proc == rank) {
1743       Crow = row - rstart;  /* local row index of C */
1744 #if defined(PETSC_USE_CTABLE)
1745       row = rmap_loc[Crow]; /* row index of submat */
1746 #else
1747       row = rmap[row];
1748 #endif
1749 
1750       if (allcolumns) {
1751         /* diagonal part A = c->A */
1752         ncols = ai[Crow+1] - ai[Crow];
1753         cols  = aj + ai[Crow];
1754         vals  = a_a + ai[Crow];
1755         i     = 0;
1756         for (k=0; k<ncols; k++) {
1757           subcols[i]   = cols[k] + cstart;
1758           subvals[i++] = vals[k];
1759         }
1760 
1761         /* off-diagonal part B = c->B */
1762         ncols = bi[Crow+1] - bi[Crow];
1763         cols  = bj + bi[Crow];
1764         vals  = b_a + bi[Crow];
1765         for (k=0; k<ncols; k++) {
1766           subcols[i]   = bmap[cols[k]];
1767           subvals[i++] = vals[k];
1768         }
1769 
1770         CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1771 
1772       } else { /* !allcolumns */
1773 #if defined(PETSC_USE_CTABLE)
1774         /* diagonal part A = c->A */
1775         ncols = ai[Crow+1] - ai[Crow];
1776         cols  = aj + ai[Crow];
1777         vals  = a_a + ai[Crow];
1778         i     = 0;
1779         for (k=0; k<ncols; k++) {
1780           tcol = cmap_loc[cols[k]];
1781           if (tcol) {
1782             subcols[i]   = --tcol;
1783             subvals[i++] = vals[k];
1784           }
1785         }
1786 
1787         /* off-diagonal part B = c->B */
1788         ncols = bi[Crow+1] - bi[Crow];
1789         cols  = bj + bi[Crow];
1790         vals  = b_a + bi[Crow];
1791         for (k=0; k<ncols; k++) {
1792           CHKERRQ(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
1793           if (tcol) {
1794             subcols[i]   = --tcol;
1795             subvals[i++] = vals[k];
1796           }
1797         }
1798 #else
1799         /* diagonal part A = c->A */
1800         ncols = ai[Crow+1] - ai[Crow];
1801         cols  = aj + ai[Crow];
1802         vals  = a_a + ai[Crow];
1803         i     = 0;
1804         for (k=0; k<ncols; k++) {
1805           tcol = cmap[cols[k]+cstart];
1806           if (tcol) {
1807             subcols[i]   = --tcol;
1808             subvals[i++] = vals[k];
1809           }
1810         }
1811 
1812         /* off-diagonal part B = c->B */
1813         ncols = bi[Crow+1] - bi[Crow];
1814         cols  = bj + bi[Crow];
1815         vals  = b_a + bi[Crow];
1816         for (k=0; k<ncols; k++) {
1817           tcol = cmap[bmap[cols[k]]];
1818           if (tcol) {
1819             subcols[i]   = --tcol;
1820             subvals[i++] = vals[k];
1821           }
1822         }
1823 #endif
1824         CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES));
1825       }
1826     }
1827   }
1828 
1829   /* Now assemble the off-proc rows */
1830   for (i=0; i<nrqs; i++) { /* for each requested message */
1831     /* recv values from other processes */
1832     CHKERRMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i));
1833     proc    = pa[idex];
1834     sbuf1_i = sbuf1[proc];
1835     /* jmax    = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1836     ct1     = 2 + 1;
1837     ct2     = 0; /* count of received C->j */
1838     ct3     = 0; /* count of received C->j that will be inserted into submat */
1839     rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
1840     rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
1841     rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */
1842 
1843     /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1844     max1 = sbuf1_i[2];             /* num of rows */
1845     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1846       row = sbuf1_i[ct1]; /* row index of submat */
1847       if (!allcolumns) {
1848         idex = 0;
1849         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1850           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1851           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1852 #if defined(PETSC_USE_CTABLE)
1853             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1854               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1855             } else {
1856               CHKERRQ(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
1857             }
1858 #else
1859             tcol = cmap[rbuf3_i[ct2]];
1860 #endif
1861             if (tcol) {
1862               subcols[idex]   = --tcol; /* may not be sorted */
1863               subvals[idex++] = rbuf4_i[ct2];
1864 
1865               /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1866                For reuse, we replace received C->j with index that should be inserted to submat */
1867               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1868             }
1869           }
1870           CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES));
1871         } else { /* scall == MAT_REUSE_MATRIX */
1872           submat = submats[0];
1873           subc   = (Mat_SeqAIJ*)submat->data;
1874 
1875           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1876           for (l=0; l<nnz; l++) {
1877             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1878             subvals[idex++] = rbuf4_i[ct2];
1879           }
1880 
1881           bj = subc->j + subc->i[row]; /* sorted column indices */
1882           CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES));
1883         }
1884       } else { /* allcolumns */
1885         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1886         CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES));
1887         ct2 += nnz;
1888       }
1889     }
1890   }
1891 
1892   /* sending a->a are done */
1893   CHKERRMPI(MPI_Waitall(nrqr,s_waits4,s_status4));
1894   CHKERRQ(PetscFree4(r_waits4,s_waits4,r_status4,s_status4));
1895 
1896   CHKERRQ(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY));
1897   CHKERRQ(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY));
1898   submats[0] = submat;
1899 
1900   /* Restore the indices */
1901   CHKERRQ(ISRestoreIndices(isrow[0],&irow));
1902   if (!allcolumns) {
1903     CHKERRQ(ISRestoreIndices(iscol[0],&icol));
1904   }
1905 
1906   /* Destroy allocated memory */
1907   for (i=0; i<nrqs; ++i) {
1908     CHKERRQ(PetscFree(rbuf4[i]));
1909   }
1910   CHKERRQ(PetscFree3(rbuf4,subcols,subvals));
1911   if (sbuf_aa) {
1912     CHKERRQ(PetscFree(sbuf_aa[0]));
1913     CHKERRQ(PetscFree(sbuf_aa));
1914   }
1915 
1916   if (scall == MAT_INITIAL_MATRIX) {
1917     CHKERRQ(PetscFree(lens));
1918     if (sbuf_aj) {
1919       CHKERRQ(PetscFree(sbuf_aj[0]));
1920       CHKERRQ(PetscFree(sbuf_aj));
1921     }
1922   }
1923   CHKERRQ(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
1924   CHKERRQ(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
1925   PetscFunctionReturn(0);
1926 }
1927 
1928 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1929 {
1930   PetscInt       ncol;
1931   PetscBool      colflag,allcolumns=PETSC_FALSE;
1932 
1933   PetscFunctionBegin;
1934   /* Allocate memory to hold all the submatrices */
1935   if (scall == MAT_INITIAL_MATRIX) {
1936     CHKERRQ(PetscCalloc1(2,submat));
1937   }
1938 
1939   /* Check for special case: each processor gets entire matrix columns */
1940   CHKERRQ(ISIdentity(iscol[0],&colflag));
1941   CHKERRQ(ISGetLocalSize(iscol[0],&ncol));
1942   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1943 
1944   CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
1945   PetscFunctionReturn(0);
1946 }
1947 
/*
   MatCreateSubMatrices_MPIAIJ - extracts ismax sequential submatrices of the
   parallel matrix C, one per (isrow[i],iscol[i]) pair.  The extraction is done
   in "stages" so that the temporary buffers stay bounded (roughly 20 MB of
   PetscInt per stage); every process must loop over the same global number of
   stages, which is agreed upon with a single MPI_Allreduce at MAT_INITIAL_MATRIX
   time and cached in the submatrix support struct for MAT_REUSE_MATRIX calls.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extracted per stage, sized so that a stage's
     scratch space is about 20M PetscInts; at least 1 even for very wide C */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    /* wantallmatrix: a single identity row IS and identity column IS covering all
       of a square C means the caller wants a copy of the whole matrix, which has
       a dedicated fast path (MatCreateSubMatrix_MPIAIJ_All) */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      CHKERRQ(ISIdentity(*isrow,&rowflag));
      CHKERRQ(ISIdentity(*iscol,&colflag));
      CHKERRQ(ISGetLocalSize(*isrow,&nrow));
      CHKERRQ(ISGetLocalSize(*iscol,&ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        CHKERRQ(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Combine both reductions in one call: MPI_MAX of -wantallmatrix gives the
       logical AND of wantallmatrix across ranks (all must want the fast path);
       MPI_MAX of nstages gives the global stage count every rank must execute */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    CHKERRMPI(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    CHKERRQ(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  /* extra nstages slots: a rank that runs out of local ISes before the global
     stage count is reached still creates one dummy matrix per empty stage */
  if (scall == MAT_INITIAL_MATRIX) {
    CHKERRQ(PetscCalloc1(ismax+nstages,submat));
  }

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;       /* no local IS left: participate with a dummy */
    else                   max_no = ismax-pos;

    CHKERRQ(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
2046 
2047 /* -------------------------------------------------------------------------*/
2048 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2049 {
2050   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2051   Mat            A  = c->A;
2052   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2053   const PetscInt **icol,**irow;
2054   PetscInt       *nrow,*ncol,start;
2055   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2056   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2057   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2058   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2059   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2060 #if defined(PETSC_USE_CTABLE)
2061   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2062 #else
2063   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2064 #endif
2065   const PetscInt *irow_i;
2066   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2067   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2068   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2069   MPI_Comm       comm;
2070   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2071   PetscMPIInt    *onodes1,*olengths1,end;
2072   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2073   Mat_SubSppt    *smat_i;
2074   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2075   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2076 
2077   PetscFunctionBegin;
2078   CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm));
2079   size = c->size;
2080   rank = c->rank;
2081 
2082   CHKERRQ(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
2083   CHKERRQ(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));
2084 
2085   for (i=0; i<ismax; i++) {
2086     CHKERRQ(ISSorted(iscol[i],&issorted[i]));
2087     if (!issorted[i]) iscsorted = issorted[i];
2088 
2089     CHKERRQ(ISSorted(isrow[i],&issorted[i]));
2090 
2091     CHKERRQ(ISGetIndices(isrow[i],&irow[i]));
2092     CHKERRQ(ISGetLocalSize(isrow[i],&nrow[i]));
2093 
2094     /* Check for special case: allcolumn */
2095     CHKERRQ(ISIdentity(iscol[i],&colflag));
2096     CHKERRQ(ISGetLocalSize(iscol[i],&ncol[i]));
2097     if (colflag && ncol[i] == C->cmap->N) {
2098       allcolumns[i] = PETSC_TRUE;
2099       icol[i] = NULL;
2100     } else {
2101       allcolumns[i] = PETSC_FALSE;
2102       CHKERRQ(ISGetIndices(iscol[i],&icol[i]));
2103     }
2104   }
2105 
2106   if (scall == MAT_REUSE_MATRIX) {
2107     /* Assumes new rows are same length as the old rows */
2108     for (i=0; i<ismax; i++) {
2109       PetscCheckFalse(!submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
2110       subc = (Mat_SeqAIJ*)submats[i]->data;
2111       PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2112 
2113       /* Initial matrix as if empty */
2114       CHKERRQ(PetscArrayzero(subc->ilen,submats[i]->rmap->n));
2115 
2116       smat_i   = subc->submatis1;
2117 
2118       nrqs        = smat_i->nrqs;
2119       nrqr        = smat_i->nrqr;
2120       rbuf1       = smat_i->rbuf1;
2121       rbuf2       = smat_i->rbuf2;
2122       rbuf3       = smat_i->rbuf3;
2123       req_source2 = smat_i->req_source2;
2124 
2125       sbuf1     = smat_i->sbuf1;
2126       sbuf2     = smat_i->sbuf2;
2127       ptr       = smat_i->ptr;
2128       tmp       = smat_i->tmp;
2129       ctr       = smat_i->ctr;
2130 
2131       pa          = smat_i->pa;
2132       req_size    = smat_i->req_size;
2133       req_source1 = smat_i->req_source1;
2134 
2135       allcolumns[i] = smat_i->allcolumns;
2136       row2proc[i]   = smat_i->row2proc;
2137       rmap[i]       = smat_i->rmap;
2138       cmap[i]       = smat_i->cmap;
2139     }
2140 
2141     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2142       PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2143       smat_i = (Mat_SubSppt*)submats[0]->data;
2144 
2145       nrqs        = smat_i->nrqs;
2146       nrqr        = smat_i->nrqr;
2147       rbuf1       = smat_i->rbuf1;
2148       rbuf2       = smat_i->rbuf2;
2149       rbuf3       = smat_i->rbuf3;
2150       req_source2 = smat_i->req_source2;
2151 
2152       sbuf1       = smat_i->sbuf1;
2153       sbuf2       = smat_i->sbuf2;
2154       ptr         = smat_i->ptr;
2155       tmp         = smat_i->tmp;
2156       ctr         = smat_i->ctr;
2157 
2158       pa          = smat_i->pa;
2159       req_size    = smat_i->req_size;
2160       req_source1 = smat_i->req_source1;
2161 
2162       allcolumns[0] = PETSC_FALSE;
2163     }
2164   } else { /* scall == MAT_INITIAL_MATRIX */
2165     /* Get some new tags to keep the communication clean */
2166     CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2));
2167     CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag3));
2168 
2169     /* evaluate communication - mesg to who, length of mesg, and buffer space
2170      required. Based on this, buffers are allocated, and data copied into them*/
2171     CHKERRQ(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */
2172 
2173     for (i=0; i<ismax; i++) {
2174       jmax   = nrow[i];
2175       irow_i = irow[i];
2176 
2177       CHKERRQ(PetscMalloc1(jmax,&row2proc_i));
2178       row2proc[i] = row2proc_i;
2179 
2180       if (issorted[i]) proc = 0;
2181       for (j=0; j<jmax; j++) {
2182         if (!issorted[i]) proc = 0;
2183         row = irow_i[j];
2184         while (row >= C->rmap->range[proc+1]) proc++;
2185         w4[proc]++;
2186         row2proc_i[j] = proc; /* map row index to proc */
2187       }
2188       for (j=0; j<size; j++) {
2189         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2190       }
2191     }
2192 
2193     nrqs     = 0;              /* no of outgoing messages */
2194     msz      = 0;              /* total mesg length (for all procs) */
2195     w1[rank] = 0;              /* no mesg sent to self */
2196     w3[rank] = 0;
2197     for (i=0; i<size; i++) {
2198       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2199     }
2200     CHKERRQ(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
2201     for (i=0,j=0; i<size; i++) {
2202       if (w1[i]) { pa[j] = i; j++; }
2203     }
2204 
2205     /* Each message would have a header = 1 + 2*(no of IS) + data */
2206     for (i=0; i<nrqs; i++) {
2207       j      = pa[i];
2208       w1[j] += w2[j] + 2* w3[j];
2209       msz   += w1[j];
2210     }
2211     CHKERRQ(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));
2212 
2213     /* Determine the number of messages to expect, their lengths, from from-ids */
2214     CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
2215     CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));
2216 
2217     /* Now post the Irecvs corresponding to these messages */
2218     CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag0));
2219     CHKERRQ(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));
2220 
2221     /* Allocate Memory for outgoing messages */
2222     CHKERRQ(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
2223     CHKERRQ(PetscArrayzero(sbuf1,size));
2224     CHKERRQ(PetscArrayzero(ptr,size));
2225 
2226     {
2227       PetscInt *iptr = tmp;
2228       k    = 0;
2229       for (i=0; i<nrqs; i++) {
2230         j        = pa[i];
2231         iptr    += k;
2232         sbuf1[j] = iptr;
2233         k        = w1[j];
2234       }
2235     }
2236 
2237     /* Form the outgoing messages. Initialize the header space */
2238     for (i=0; i<nrqs; i++) {
2239       j           = pa[i];
2240       sbuf1[j][0] = 0;
2241       CHKERRQ(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
2242       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2243     }
2244 
2245     /* Parse the isrow and copy data into outbuf */
2246     for (i=0; i<ismax; i++) {
2247       row2proc_i = row2proc[i];
2248       CHKERRQ(PetscArrayzero(ctr,size));
2249       irow_i = irow[i];
2250       jmax   = nrow[i];
2251       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2252         proc = row2proc_i[j];
2253         if (proc != rank) { /* copy to the outgoing buf*/
2254           ctr[proc]++;
2255           *ptr[proc] = irow_i[j];
2256           ptr[proc]++;
2257         }
2258       }
2259       /* Update the headers for the current IS */
2260       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2261         if ((ctr_j = ctr[j])) {
2262           sbuf1_j        = sbuf1[j];
2263           k              = ++sbuf1_j[0];
2264           sbuf1_j[2*k]   = ctr_j;
2265           sbuf1_j[2*k-1] = i;
2266         }
2267       }
2268     }
2269 
2270     /*  Now  post the sends */
2271     CHKERRQ(PetscMalloc1(nrqs,&s_waits1));
2272     for (i=0; i<nrqs; ++i) {
2273       j    = pa[i];
2274       CHKERRMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
2275     }
2276 
2277     /* Post Receives to capture the buffer size */
2278     CHKERRQ(PetscMalloc1(nrqs,&r_waits2));
2279     CHKERRQ(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
2280     if (nrqs) rbuf2[0] = tmp + msz;
2281     for (i=1; i<nrqs; ++i) {
2282       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2283     }
2284     for (i=0; i<nrqs; ++i) {
2285       j    = pa[i];
2286       CHKERRMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
2287     }
2288 
2289     /* Send to other procs the buf size they should allocate */
2290     /* Receive messages*/
2291     CHKERRQ(PetscMalloc1(nrqr,&s_waits2));
2292     CHKERRQ(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
2293     {
2294       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2295       PetscInt   *sbuf2_i;
2296 
2297       CHKERRMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
2298       for (i=0; i<nrqr; ++i) {
2299         req_size[i] = 0;
2300         rbuf1_i        = rbuf1[i];
2301         start          = 2*rbuf1_i[0] + 1;
2302         end            = olengths1[i];
2303         CHKERRQ(PetscMalloc1(end,&sbuf2[i]));
2304         sbuf2_i        = sbuf2[i];
2305         for (j=start; j<end; j++) {
2306           id              = rbuf1_i[j] - rstart;
2307           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2308           sbuf2_i[j]      = ncols;
2309           req_size[i] += ncols;
2310         }
2311         req_source1[i] = onodes1[i];
2312         /* form the header */
2313         sbuf2_i[0] = req_size[i];
2314         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2315 
2316         CHKERRMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
2317       }
2318     }
2319 
2320     CHKERRQ(PetscFree(onodes1));
2321     CHKERRQ(PetscFree(olengths1));
2322     CHKERRQ(PetscFree(r_waits1));
2323     CHKERRQ(PetscFree4(w1,w2,w3,w4));
2324 
2325     /* Receive messages*/
2326     CHKERRQ(PetscMalloc1(nrqs,&r_waits3));
2327     CHKERRMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
2328     for (i=0; i<nrqs; ++i) {
2329       CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
2330       req_source2[i] = pa[i];
2331       CHKERRMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
2332     }
2333     CHKERRQ(PetscFree(r_waits2));
2334 
2335     /* Wait on sends1 and sends2 */
2336     CHKERRMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
2337     CHKERRMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
2338     CHKERRQ(PetscFree(s_waits1));
2339     CHKERRQ(PetscFree(s_waits2));
2340 
2341     /* Now allocate sending buffers for a->j, and send them off */
2342     CHKERRQ(PetscMalloc1(nrqr,&sbuf_aj));
2343     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2344     if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aj[0]));
2345     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2346 
2347     CHKERRQ(PetscMalloc1(nrqr,&s_waits3));
2348     {
2349       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2350       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2351       PetscInt cend = C->cmap->rend;
2352       PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2353 
2354       for (i=0; i<nrqr; i++) {
2355         rbuf1_i   = rbuf1[i];
2356         sbuf_aj_i = sbuf_aj[i];
2357         ct1       = 2*rbuf1_i[0] + 1;
2358         ct2       = 0;
2359         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2360           kmax = rbuf1[i][2*j];
2361           for (k=0; k<kmax; k++,ct1++) {
2362             row    = rbuf1_i[ct1] - rstart;
2363             nzA    = a_i[row+1] - a_i[row];
2364             nzB    = b_i[row+1] - b_i[row];
2365             ncols  = nzA + nzB;
2366             cworkA = a_j + a_i[row];
2367             cworkB = b_j + b_i[row];
2368 
2369             /* load the column indices for this row into cols */
2370             cols = sbuf_aj_i + ct2;
2371 
2372             lwrite = 0;
2373             for (l=0; l<nzB; l++) {
2374               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2375             }
2376             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2377             for (l=0; l<nzB; l++) {
2378               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2379             }
2380 
2381             ct2 += ncols;
2382           }
2383         }
2384         CHKERRMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
2385       }
2386     }
2387 
2388     /* create col map: global col of C -> local col of submatrices */
2389     {
2390       const PetscInt *icol_i;
2391 #if defined(PETSC_USE_CTABLE)
2392       for (i=0; i<ismax; i++) {
2393         if (!allcolumns[i]) {
2394           CHKERRQ(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));
2395 
2396           jmax   = ncol[i];
2397           icol_i = icol[i];
2398           cmap_i = cmap[i];
2399           for (j=0; j<jmax; j++) {
2400             CHKERRQ(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
2401           }
2402         } else cmap[i] = NULL;
2403       }
2404 #else
2405       for (i=0; i<ismax; i++) {
2406         if (!allcolumns[i]) {
2407           CHKERRQ(PetscCalloc1(C->cmap->N,&cmap[i]));
2408           jmax   = ncol[i];
2409           icol_i = icol[i];
2410           cmap_i = cmap[i];
2411           for (j=0; j<jmax; j++) {
2412             cmap_i[icol_i[j]] = j+1;
2413           }
2414         } else cmap[i] = NULL;
2415       }
2416 #endif
2417     }
2418 
2419     /* Create lens which is required for MatCreate... */
2420     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2421     CHKERRQ(PetscMalloc1(ismax,&lens));
2422 
2423     if (ismax) {
2424       CHKERRQ(PetscCalloc1(j,&lens[0]));
2425     }
2426     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2427 
2428     /* Update lens from local data */
2429     for (i=0; i<ismax; i++) {
2430       row2proc_i = row2proc[i];
2431       jmax = nrow[i];
2432       if (!allcolumns[i]) cmap_i = cmap[i];
2433       irow_i = irow[i];
2434       lens_i = lens[i];
2435       for (j=0; j<jmax; j++) {
2436         row = irow_i[j];
2437         proc = row2proc_i[j];
2438         if (proc == rank) {
2439           CHKERRQ(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2440           if (!allcolumns[i]) {
2441             for (k=0; k<ncols; k++) {
2442 #if defined(PETSC_USE_CTABLE)
2443               CHKERRQ(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2444 #else
2445               tcol = cmap_i[cols[k]];
2446 #endif
2447               if (tcol) lens_i[j]++;
2448             }
2449           } else { /* allcolumns */
2450             lens_i[j] = ncols;
2451           }
2452           CHKERRQ(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
2453         }
2454       }
2455     }
2456 
2457     /* Create row map: global row of C -> local row of submatrices */
2458 #if defined(PETSC_USE_CTABLE)
2459     for (i=0; i<ismax; i++) {
2460       CHKERRQ(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
2461       irow_i = irow[i];
2462       jmax   = nrow[i];
2463       for (j=0; j<jmax; j++) {
2464       CHKERRQ(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
2465       }
2466     }
2467 #else
2468     for (i=0; i<ismax; i++) {
2469       CHKERRQ(PetscCalloc1(C->rmap->N,&rmap[i]));
2470       rmap_i = rmap[i];
2471       irow_i = irow[i];
2472       jmax   = nrow[i];
2473       for (j=0; j<jmax; j++) {
2474         rmap_i[irow_i[j]] = j;
2475       }
2476     }
2477 #endif
2478 
2479     /* Update lens from offproc data */
2480     {
2481       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2482 
2483       CHKERRMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
2484       for (tmp2=0; tmp2<nrqs; tmp2++) {
2485         sbuf1_i = sbuf1[pa[tmp2]];
2486         jmax    = sbuf1_i[0];
2487         ct1     = 2*jmax+1;
2488         ct2     = 0;
2489         rbuf2_i = rbuf2[tmp2];
2490         rbuf3_i = rbuf3[tmp2];
2491         for (j=1; j<=jmax; j++) {
2492           is_no  = sbuf1_i[2*j-1];
2493           max1   = sbuf1_i[2*j];
2494           lens_i = lens[is_no];
2495           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2496           rmap_i = rmap[is_no];
2497           for (k=0; k<max1; k++,ct1++) {
2498 #if defined(PETSC_USE_CTABLE)
2499             CHKERRQ(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
2500             row--;
2501             PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2502 #else
2503             row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2504 #endif
2505             max2 = rbuf2_i[ct1];
2506             for (l=0; l<max2; l++,ct2++) {
2507               if (!allcolumns[is_no]) {
2508 #if defined(PETSC_USE_CTABLE)
2509                 CHKERRQ(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2510 #else
2511                 tcol = cmap_i[rbuf3_i[ct2]];
2512 #endif
2513                 if (tcol) lens_i[row]++;
2514               } else { /* allcolumns */
2515                 lens_i[row]++; /* lens_i[row] += max2 ? */
2516               }
2517             }
2518           }
2519         }
2520       }
2521     }
2522     CHKERRQ(PetscFree(r_waits3));
2523     CHKERRMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
2524     CHKERRQ(PetscFree(s_waits3));
2525 
2526     /* Create the submatrices */
2527     for (i=0; i<ismax; i++) {
2528       PetscInt    rbs,cbs;
2529 
2530       CHKERRQ(ISGetBlockSize(isrow[i],&rbs));
2531       CHKERRQ(ISGetBlockSize(iscol[i],&cbs));
2532 
2533       CHKERRQ(MatCreate(PETSC_COMM_SELF,submats+i));
2534       CHKERRQ(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));
2535 
2536       CHKERRQ(MatSetBlockSizes(submats[i],rbs,cbs));
2537       CHKERRQ(MatSetType(submats[i],((PetscObject)A)->type_name));
2538       CHKERRQ(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));
2539 
2540       /* create struct Mat_SubSppt and attached it to submat */
2541       CHKERRQ(PetscNew(&smat_i));
2542       subc = (Mat_SeqAIJ*)submats[i]->data;
2543       subc->submatis1 = smat_i;
2544 
2545       smat_i->destroy          = submats[i]->ops->destroy;
2546       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2547       submats[i]->factortype   = C->factortype;
2548 
2549       smat_i->id          = i;
2550       smat_i->nrqs        = nrqs;
2551       smat_i->nrqr        = nrqr;
2552       smat_i->rbuf1       = rbuf1;
2553       smat_i->rbuf2       = rbuf2;
2554       smat_i->rbuf3       = rbuf3;
2555       smat_i->sbuf2       = sbuf2;
2556       smat_i->req_source2 = req_source2;
2557 
2558       smat_i->sbuf1       = sbuf1;
2559       smat_i->ptr         = ptr;
2560       smat_i->tmp         = tmp;
2561       smat_i->ctr         = ctr;
2562 
2563       smat_i->pa           = pa;
2564       smat_i->req_size     = req_size;
2565       smat_i->req_source1  = req_source1;
2566 
2567       smat_i->allcolumns  = allcolumns[i];
2568       smat_i->singleis    = PETSC_FALSE;
2569       smat_i->row2proc    = row2proc[i];
2570       smat_i->rmap        = rmap[i];
2571       smat_i->cmap        = cmap[i];
2572     }
2573 
2574     if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
2575       CHKERRQ(MatCreate(PETSC_COMM_SELF,&submats[0]));
2576       CHKERRQ(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
2577       CHKERRQ(MatSetType(submats[0],MATDUMMY));
2578 
2579       /* create struct Mat_SubSppt and attached it to submat */
2580       CHKERRQ(PetscNewLog(submats[0],&smat_i));
2581       submats[0]->data = (void*)smat_i;
2582 
2583       smat_i->destroy          = submats[0]->ops->destroy;
2584       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2585       submats[0]->factortype   = C->factortype;
2586 
2587       smat_i->id          = 0;
2588       smat_i->nrqs        = nrqs;
2589       smat_i->nrqr        = nrqr;
2590       smat_i->rbuf1       = rbuf1;
2591       smat_i->rbuf2       = rbuf2;
2592       smat_i->rbuf3       = rbuf3;
2593       smat_i->sbuf2       = sbuf2;
2594       smat_i->req_source2 = req_source2;
2595 
2596       smat_i->sbuf1       = sbuf1;
2597       smat_i->ptr         = ptr;
2598       smat_i->tmp         = tmp;
2599       smat_i->ctr         = ctr;
2600 
2601       smat_i->pa           = pa;
2602       smat_i->req_size     = req_size;
2603       smat_i->req_source1  = req_source1;
2604 
2605       smat_i->allcolumns  = PETSC_FALSE;
2606       smat_i->singleis    = PETSC_FALSE;
2607       smat_i->row2proc    = NULL;
2608       smat_i->rmap        = NULL;
2609       smat_i->cmap        = NULL;
2610     }
2611 
2612     if (ismax) CHKERRQ(PetscFree(lens[0]));
2613     CHKERRQ(PetscFree(lens));
2614     if (sbuf_aj) {
2615       CHKERRQ(PetscFree(sbuf_aj[0]));
2616       CHKERRQ(PetscFree(sbuf_aj));
2617     }
2618 
2619   } /* endof scall == MAT_INITIAL_MATRIX */
2620 
2621   /* Post recv matrix values */
2622   CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag4));
2623   CHKERRQ(PetscMalloc1(nrqs,&rbuf4));
2624   CHKERRQ(PetscMalloc1(nrqs,&r_waits4));
2625   for (i=0; i<nrqs; ++i) {
2626     CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
2627     CHKERRMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
2628   }
2629 
2630   /* Allocate sending buffers for a->a, and send them off */
2631   CHKERRQ(PetscMalloc1(nrqr,&sbuf_aa));
2632   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2633   if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aa[0]));
2634   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2635 
2636   CHKERRQ(PetscMalloc1(nrqr,&s_waits4));
2637   {
2638     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2639     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2640     PetscInt    cend   = C->cmap->rend;
2641     PetscInt    *b_j   = b->j;
2642     PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2643 
2644     CHKERRQ(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
2645     CHKERRQ(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
2646     for (i=0; i<nrqr; i++) {
2647       rbuf1_i   = rbuf1[i];
2648       sbuf_aa_i = sbuf_aa[i];
2649       ct1       = 2*rbuf1_i[0]+1;
2650       ct2       = 0;
2651       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2652         kmax = rbuf1_i[2*j];
2653         for (k=0; k<kmax; k++,ct1++) {
2654           row    = rbuf1_i[ct1] - rstart;
2655           nzA    = a_i[row+1] - a_i[row];
2656           nzB    = b_i[row+1] - b_i[row];
2657           ncols  = nzA + nzB;
2658           cworkB = b_j + b_i[row];
2659           vworkA = a_a + a_i[row];
2660           vworkB = b_a + b_i[row];
2661 
2662           /* load the column values for this row into vals*/
2663           vals = sbuf_aa_i+ct2;
2664 
2665           lwrite = 0;
2666           for (l=0; l<nzB; l++) {
2667             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2668           }
2669           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2670           for (l=0; l<nzB; l++) {
2671             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2672           }
2673 
2674           ct2 += ncols;
2675         }
2676       }
2677       CHKERRMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
2678     }
2679     CHKERRQ(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
2680     CHKERRQ(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
2681   }
2682 
2683   /* Assemble the matrices */
2684   /* First assemble the local rows */
2685   for (i=0; i<ismax; i++) {
2686     row2proc_i = row2proc[i];
2687     subc      = (Mat_SeqAIJ*)submats[i]->data;
2688     imat_ilen = subc->ilen;
2689     imat_j    = subc->j;
2690     imat_i    = subc->i;
2691     imat_a    = subc->a;
2692 
2693     if (!allcolumns[i]) cmap_i = cmap[i];
2694     rmap_i = rmap[i];
2695     irow_i = irow[i];
2696     jmax   = nrow[i];
2697     for (j=0; j<jmax; j++) {
2698       row  = irow_i[j];
2699       proc = row2proc_i[j];
2700       if (proc == rank) {
2701         old_row = row;
2702 #if defined(PETSC_USE_CTABLE)
2703         CHKERRQ(PetscTableFind(rmap_i,row+1,&row));
2704         row--;
2705 #else
2706         row = rmap_i[row];
2707 #endif
2708         ilen_row = imat_ilen[row];
2709         CHKERRQ(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2710         mat_i    = imat_i[row];
2711         mat_a    = imat_a + mat_i;
2712         mat_j    = imat_j + mat_i;
2713         if (!allcolumns[i]) {
2714           for (k=0; k<ncols; k++) {
2715 #if defined(PETSC_USE_CTABLE)
2716             CHKERRQ(PetscTableFind(cmap_i,cols[k]+1,&tcol));
2717 #else
2718             tcol = cmap_i[cols[k]];
2719 #endif
2720             if (tcol) {
2721               *mat_j++ = tcol - 1;
2722               *mat_a++ = vals[k];
2723               ilen_row++;
2724             }
2725           }
2726         } else { /* allcolumns */
2727           for (k=0; k<ncols; k++) {
2728             *mat_j++ = cols[k];  /* global col index! */
2729             *mat_a++ = vals[k];
2730             ilen_row++;
2731           }
2732         }
2733         CHKERRQ(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
2734 
2735         imat_ilen[row] = ilen_row;
2736       }
2737     }
2738   }
2739 
2740   /* Now assemble the off proc rows */
2741   CHKERRMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
2742   for (tmp2=0; tmp2<nrqs; tmp2++) {
2743     sbuf1_i = sbuf1[pa[tmp2]];
2744     jmax    = sbuf1_i[0];
2745     ct1     = 2*jmax + 1;
2746     ct2     = 0;
2747     rbuf2_i = rbuf2[tmp2];
2748     rbuf3_i = rbuf3[tmp2];
2749     rbuf4_i = rbuf4[tmp2];
2750     for (j=1; j<=jmax; j++) {
2751       is_no     = sbuf1_i[2*j-1];
2752       rmap_i    = rmap[is_no];
2753       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2754       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2755       imat_ilen = subc->ilen;
2756       imat_j    = subc->j;
2757       imat_i    = subc->i;
2758       imat_a    = subc->a;
2759       max1      = sbuf1_i[2*j];
2760       for (k=0; k<max1; k++,ct1++) {
2761         row = sbuf1_i[ct1];
2762 #if defined(PETSC_USE_CTABLE)
2763         CHKERRQ(PetscTableFind(rmap_i,row+1,&row));
2764         row--;
2765 #else
2766         row = rmap_i[row];
2767 #endif
2768         ilen  = imat_ilen[row];
2769         mat_i = imat_i[row];
2770         mat_a = imat_a + mat_i;
2771         mat_j = imat_j + mat_i;
2772         max2  = rbuf2_i[ct1];
2773         if (!allcolumns[is_no]) {
2774           for (l=0; l<max2; l++,ct2++) {
2775 #if defined(PETSC_USE_CTABLE)
2776             CHKERRQ(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
2777 #else
2778             tcol = cmap_i[rbuf3_i[ct2]];
2779 #endif
2780             if (tcol) {
2781               *mat_j++ = tcol - 1;
2782               *mat_a++ = rbuf4_i[ct2];
2783               ilen++;
2784             }
2785           }
2786         } else { /* allcolumns */
2787           for (l=0; l<max2; l++,ct2++) {
2788             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2789             *mat_a++ = rbuf4_i[ct2];
2790             ilen++;
2791           }
2792         }
2793         imat_ilen[row] = ilen;
2794       }
2795     }
2796   }
2797 
2798   if (!iscsorted) { /* sort column indices of the rows */
2799     for (i=0; i<ismax; i++) {
2800       subc      = (Mat_SeqAIJ*)submats[i]->data;
2801       imat_j    = subc->j;
2802       imat_i    = subc->i;
2803       imat_a    = subc->a;
2804       imat_ilen = subc->ilen;
2805 
2806       if (allcolumns[i]) continue;
2807       jmax = nrow[i];
2808       for (j=0; j<jmax; j++) {
2809         mat_i = imat_i[j];
2810         mat_a = imat_a + mat_i;
2811         mat_j = imat_j + mat_i;
2812         CHKERRQ(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
2813       }
2814     }
2815   }
2816 
2817   CHKERRQ(PetscFree(r_waits4));
2818   CHKERRMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
2819   CHKERRQ(PetscFree(s_waits4));
2820 
2821   /* Restore the indices */
2822   for (i=0; i<ismax; i++) {
2823     CHKERRQ(ISRestoreIndices(isrow[i],irow+i));
2824     if (!allcolumns[i]) {
2825       CHKERRQ(ISRestoreIndices(iscol[i],icol+i));
2826     }
2827   }
2828 
2829   for (i=0; i<ismax; i++) {
2830     CHKERRQ(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
2831     CHKERRQ(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
2832   }
2833 
2834   /* Destroy allocated memory */
2835   if (sbuf_aa) {
2836     CHKERRQ(PetscFree(sbuf_aa[0]));
2837     CHKERRQ(PetscFree(sbuf_aa));
2838   }
2839   CHKERRQ(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));
2840 
2841   for (i=0; i<nrqs; ++i) {
2842     CHKERRQ(PetscFree(rbuf4[i]));
2843   }
2844   CHKERRQ(PetscFree(rbuf4));
2845 
2846   CHKERRQ(PetscFree4(row2proc,cmap,rmap,allcolumns));
2847   PetscFunctionReturn(0);
2848 }
2849 
2850 /*
2851  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2852  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2854  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2855  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2856  state, and needs to be "assembled" later by compressing B's column space.
2857 
2858  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2859  Following this call, C->A & C->B have been created, even if empty.
2860  */
2861 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2862 {
2863   /* If making this function public, change the error returned in this function away from _PLIB. */
2864   Mat_MPIAIJ     *aij;
2865   Mat_SeqAIJ     *Baij;
2866   PetscBool      seqaij,Bdisassembled;
2867   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2868   PetscScalar    v;
2869   const PetscInt *rowindices,*colindices;
2870 
2871   PetscFunctionBegin;
2872   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2873   if (A) {
2874     CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij));
2875     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2876     if (rowemb) {
2877       CHKERRQ(ISGetLocalSize(rowemb,&m));
2878       PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2879     } else {
2880       if (C->rmap->n != A->rmap->n) {
2881         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2882       }
2883     }
2884     if (dcolemb) {
2885       CHKERRQ(ISGetLocalSize(dcolemb,&n));
2886       PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n);
2887     } else {
2888       PetscCheckFalse(C->cmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2889     }
2890   }
2891   if (B) {
2892     CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij));
2893     PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2894     if (rowemb) {
2895       CHKERRQ(ISGetLocalSize(rowemb,&m));
2896       PetscCheckFalse(m != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n);
2897     } else {
2898       if (C->rmap->n != B->rmap->n) {
2899         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2900       }
2901     }
2902     if (ocolemb) {
2903       CHKERRQ(ISGetLocalSize(ocolemb,&n));
2904       PetscCheckFalse(n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n);
2905     } else {
2906       PetscCheckFalse(C->cmap->N - C->cmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2907     }
2908   }
2909 
2910   aij = (Mat_MPIAIJ*)C->data;
2911   if (!aij->A) {
2912     /* Mimic parts of MatMPIAIJSetPreallocation() */
2913     CHKERRQ(MatCreate(PETSC_COMM_SELF,&aij->A));
2914     CHKERRQ(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n));
2915     CHKERRQ(MatSetBlockSizesFromMats(aij->A,C,C));
2916     CHKERRQ(MatSetType(aij->A,MATSEQAIJ));
2917     CHKERRQ(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A));
2918   }
2919   if (A) {
2920     CHKERRQ(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A));
2921   } else {
2922     CHKERRQ(MatSetUp(aij->A));
2923   }
2924   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2925     /*
2926       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2927       need to "disassemble" B -- convert it to using C's global indices.
2928       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2929 
2930       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2931       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2932 
2933       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2934       At least avoid calling MatSetValues() and the implied searches?
2935     */
2936 
2937     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2938 #if defined(PETSC_USE_CTABLE)
2939       CHKERRQ(PetscTableDestroy(&aij->colmap));
2940 #else
2941       CHKERRQ(PetscFree(aij->colmap));
2942       /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2943       if (aij->B) {
2944         CHKERRQ(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt)));
2945       }
2946 #endif
2947       ngcol = 0;
2948       if (aij->lvec) {
2949         CHKERRQ(VecGetSize(aij->lvec,&ngcol));
2950       }
2951       if (aij->garray) {
2952         CHKERRQ(PetscFree(aij->garray));
2953         CHKERRQ(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt)));
2954       }
2955       CHKERRQ(VecDestroy(&aij->lvec));
2956       CHKERRQ(VecScatterDestroy(&aij->Mvctx));
2957     }
2958     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2959       CHKERRQ(MatDestroy(&aij->B));
2960     }
2961     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2962       CHKERRQ(MatZeroEntries(aij->B));
2963     }
2964   }
2965   Bdisassembled = PETSC_FALSE;
2966   if (!aij->B) {
2967     CHKERRQ(MatCreate(PETSC_COMM_SELF,&aij->B));
2968     CHKERRQ(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B));
2969     CHKERRQ(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N));
2970     CHKERRQ(MatSetBlockSizesFromMats(aij->B,B,B));
2971     CHKERRQ(MatSetType(aij->B,MATSEQAIJ));
2972     Bdisassembled = PETSC_TRUE;
2973   }
2974   if (B) {
2975     Baij = (Mat_SeqAIJ*)B->data;
2976     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2977       CHKERRQ(PetscMalloc1(B->rmap->n,&nz));
2978       for (i=0; i<B->rmap->n; i++) {
2979         nz[i] = Baij->i[i+1] - Baij->i[i];
2980       }
2981       CHKERRQ(MatSeqAIJSetPreallocation(aij->B,0,nz));
2982       CHKERRQ(PetscFree(nz));
2983     }
2984 
2985     CHKERRQ(PetscLayoutGetRange(C->rmap,&rstart,&rend));
2986     shift = rend-rstart;
2987     count = 0;
2988     rowindices = NULL;
2989     colindices = NULL;
2990     if (rowemb) {
2991       CHKERRQ(ISGetIndices(rowemb,&rowindices));
2992     }
2993     if (ocolemb) {
2994       CHKERRQ(ISGetIndices(ocolemb,&colindices));
2995     }
2996     for (i=0; i<B->rmap->n; i++) {
2997       PetscInt row;
2998       row = i;
2999       if (rowindices) row = rowindices[i];
3000       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3001         col  = Baij->j[count];
3002         if (colindices) col = colindices[col];
3003         if (Bdisassembled && col>=rstart) col += shift;
3004         v    = Baij->a[count];
3005         CHKERRQ(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES));
3006         ++count;
3007       }
3008     }
3009     /* No assembly for aij->B is necessary. */
3010     /* FIXME: set aij->B's nonzerostate correctly. */
3011   } else {
3012     CHKERRQ(MatSetUp(aij->B));
3013   }
3014   C->preallocated  = PETSC_TRUE;
3015   C->was_assembled = PETSC_FALSE;
3016   C->assembled     = PETSC_FALSE;
3017    /*
3018       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3019       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3020    */
3021   PetscFunctionReturn(0);
3022 }
3023 
3024 /*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3026  */
3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3028 {
3029   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3030 
3031   PetscFunctionBegin;
3032   PetscValidPointer(A,2);
3033   PetscValidPointer(B,3);
3034   /* FIXME: make sure C is assembled */
3035   *A = aij->A;
3036   *B = aij->B;
3037   /* Note that we don't incref *A and *B, so be careful! */
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 /*
3042   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3043   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3044 */
3045 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3046                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3047                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3048                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3049                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3050 {
3051   PetscMPIInt    size,flag;
3052   PetscInt       i,ii,cismax,ispar;
3053   Mat            *A,*B;
3054   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3055 
3056   PetscFunctionBegin;
3057   if (!ismax) PetscFunctionReturn(0);
3058 
3059   for (i = 0, cismax = 0; i < ismax; ++i) {
3060     CHKERRMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
3061     PetscCheckFalse(flag != MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3062     CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3063     if (size > 1) ++cismax;
3064   }
3065 
3066   /*
3067      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3068      ispar counts the number of parallel ISs across C's comm.
3069   */
3070   CHKERRMPI(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
3071   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3072     CHKERRQ((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
3073     PetscFunctionReturn(0);
3074   }
3075 
3076   /* if (ispar) */
3077   /*
3078     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3079     These are used to extract the off-diag portion of the resulting parallel matrix.
3080     The row IS for the off-diag portion is the same as for the diag portion,
3081     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3082   */
3083   CHKERRQ(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
3084   CHKERRQ(PetscMalloc1(cismax,&ciscol_p));
3085   for (i = 0, ii = 0; i < ismax; ++i) {
3086     CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3087     if (size > 1) {
3088       /*
3089          TODO: This is the part that's ***NOT SCALABLE***.
3090          To fix this we need to extract just the indices of C's nonzero columns
3091          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3092          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3093          to be done without serializing on the IS list, so, most likely, it is best
3094          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3095       */
3096       CHKERRQ(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
3097       /* Now we have to
3098          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3099              were sorted on each rank, concatenated they might no longer be sorted;
3100          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3101              indices in the nondecreasing order to the original index positions.
3102          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3103       */
3104       CHKERRQ(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
3105       CHKERRQ(ISSort(ciscol[ii]));
3106       ++ii;
3107     }
3108   }
3109   CHKERRQ(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
3110   for (i = 0, ii = 0; i < ismax; ++i) {
3111     PetscInt       j,issize;
3112     const PetscInt *indices;
3113 
3114     /*
3115        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3116      */
3117     CHKERRQ(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
3118     CHKERRQ(ISSort(isrow[i]));
3119     CHKERRQ(ISGetLocalSize(isrow[i],&issize));
3120     CHKERRQ(ISGetIndices(isrow[i],&indices));
3121     for (j = 1; j < issize; ++j) {
3122       PetscCheckFalse(indices[j] == indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3123     }
3124     CHKERRQ(ISRestoreIndices(isrow[i],&indices));
3125     CHKERRQ(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
3126     CHKERRQ(ISSort(iscol[i]));
3127     CHKERRQ(ISGetLocalSize(iscol[i],&issize));
3128     CHKERRQ(ISGetIndices(iscol[i],&indices));
3129     for (j = 1; j < issize; ++j) {
3130       PetscCheckFalse(indices[j-1] == indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
3131     }
3132     CHKERRQ(ISRestoreIndices(iscol[i],&indices));
3133     CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3134     if (size > 1) {
3135       cisrow[ii] = isrow[i];
3136       ++ii;
3137     }
3138   }
3139   /*
3140     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3141     array of sequential matrices underlying the resulting parallel matrices.
3142     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3143     contain duplicates.
3144 
3145     There are as many diag matrices as there are original index sets. There are only as many parallel
3146     and off-diag matrices, as there are parallel (comm size > 1) index sets.
3147 
3148     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3149     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3150       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3151       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3152       with A[i] and B[ii] extracted from the corresponding MPI submat.
3153     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3154       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3155       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3156       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
3157       values into A[i] and B[ii] sitting inside the corresponding submat.
3158     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3159       A[i], B[ii], AA[i] or BB[ii] matrices.
3160   */
3161   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3162   if (scall == MAT_INITIAL_MATRIX) {
3163     CHKERRQ(PetscMalloc1(ismax,submat));
3164   }
3165 
3166   /* Now obtain the sequential A and B submatrices separately. */
3167   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3168   CHKERRQ((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
3169   CHKERRQ((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));
3170 
3171   /*
3172     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3173     matrices A & B have been extracted directly into the parallel matrices containing them, or
3174     simply into the sequential matrix identical with the corresponding A (if size == 1).
3175     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3176     to have the same sparsity pattern.
3177     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3178     must be constructed for C. This is done by setseqmat(s).
3179   */
3180   for (i = 0, ii = 0; i < ismax; ++i) {
3181     /*
3182        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3183        That way we can avoid sorting and computing permutations when reusing.
3184        To this end:
3185         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3186         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3187           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3188     */
3189     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3190 
3191     CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
3192     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3193     if (size > 1) {
3194       if (scall == MAT_INITIAL_MATRIX) {
3195         CHKERRQ(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
3196         CHKERRQ(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
3197         CHKERRQ(MatSetType((*submat)[i],MATMPIAIJ));
3198         CHKERRQ(PetscLayoutSetUp((*submat)[i]->rmap));
3199         CHKERRQ(PetscLayoutSetUp((*submat)[i]->cmap));
3200       }
3201       /*
3202         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3203       */
3204       {
3205         Mat AA = A[i],BB = B[ii];
3206 
3207         if (AA || BB) {
3208           CHKERRQ(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
3209           CHKERRQ(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
3210           CHKERRQ(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
3211         }
3212         CHKERRQ(MatDestroy(&AA));
3213       }
3214       CHKERRQ(ISDestroy(ciscol+ii));
3215       CHKERRQ(ISDestroy(ciscol_p+ii));
3216       ++ii;
3217     } else { /* if (size == 1) */
3218       if (scall == MAT_REUSE_MATRIX) {
3219         CHKERRQ(MatDestroy(&(*submat)[i]));
3220       }
3221       if (isrow_p[i] || iscol_p[i]) {
3222         CHKERRQ(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
3223         CHKERRQ(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
3224         /* Otherwise A is extracted straight into (*submats)[i]. */
3225         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3226         CHKERRQ(MatDestroy(A+i));
3227       } else (*submat)[i] = A[i];
3228     }
3229     CHKERRQ(ISDestroy(&isrow_p[i]));
3230     CHKERRQ(ISDestroy(&iscol_p[i]));
3231   }
3232   CHKERRQ(PetscFree2(cisrow,ciscol));
3233   CHKERRQ(PetscFree2(isrow_p,iscol_p));
3234   CHKERRQ(PetscFree(ciscol_p));
3235   CHKERRQ(PetscFree(A));
3236   CHKERRQ(MatDestroySubMatrices(cismax,&B));
3237   PetscFunctionReturn(0);
3238 }
3239 
3240 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3241 {
3242   PetscFunctionBegin;
3243   CHKERRQ(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
3244   PetscFunctionReturn(0);
3245 }
3246