xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 6ac5842e34eedc6428162d8d42bedaaf46eae34c)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4    and to extract the submatrices that are shared across processes.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
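
/*
   Illustrative usage (sketch, not part of this file's API): the routine above implements
   the public MatIncreaseOverlap() operation for MATMPIAIJ, e.g.

     IS is[1];
     ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,2);CHKERRQ(ierr);    increase the overlap by 2 levels
     ierr = ISDestroy(&is[0]);CHKERRQ(ierr);

   Each input IS is replaced by a new IS holding the enlarged set of rows (in discovery
   order, so not necessarily sorted); nlocal and rstart are placeholders for the local
   row count and the first locally owned row.
*/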
29 
30 /*
31   Sample message format:
32   If processor A wants processor B to process some elements corresponding
33   to index sets is[1] and is[5]:
34   mesg [0] = 2   (number of index sets in the message)
35   -----------
36   mesg [1] = 1   => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5   => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]   data of is[1]
44   -----------
45   mesg [n+1]
46   mesg [m]   data of is[5]
47   -----------
48 
49   Notes:
50   nrqs - number of requests sent (or to be sent out)
51   nrqr - number of requests received (which have to be, or have already been, processed)
52 */
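
/*
   Worked example (illustrative): if processor A needs processor B to process rows
   {10,11} for is[1] and row {40} for is[5], the outgoing message is

     mesg = [ 2,   1, 2,   5, 1,   10, 11,   40 ]

   i.e. 2 index sets; is[1] with 2 entries and is[5] with 1 entry; then the data for
   is[1] followed by the data for is[5].
*/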
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
56 {
57   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
58   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
59   const PetscInt **idx,*idx_i;
60   PetscInt       *n,**data,len;
61   PetscErrorCode ierr;
62   PetscMPIInt    size,rank,tag1,tag2;
63   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
64   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
65   PetscBT        *table;
66   MPI_Comm       comm;
67   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68   MPI_Status     *s_status,*recv_status;
69   char           *t_p;
70 
71   PetscFunctionBegin;
72   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
73   size = c->size;
74   rank = c->rank;
75   M    = C->rmap->N;
76 
77   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
79 
80   ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);
81 
82   for (i=0; i<imax; i++) {
83     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
84     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
85   }
86 
87   /* evaluate communication - mesg to who, length of mesg, and buffer space
88      required. Based on this, buffers are allocated, and data copied into them */
89   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
90   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
91   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
92   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
93   for (i=0; i<imax; i++) {
94     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
95     idx_i = idx[i];
96     len   = n[i];
97     for (j=0; j<len; j++) {
98       row = idx_i[j];
99       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
101       w4[proc]++;
102     }
103     for (j=0; j<size; j++) {
104       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105     }
106   }
107 
108   nrqs     = 0;              /* no of outgoing messages */
109   msz      = 0;              /* total mesg length (for all procs) */
110   w1[rank] = 0;              /* no mesg sent to self */
111   w3[rank] = 0;
112   for (i=0; i<size; i++) {
113     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114   }
115   /* pa - is list of processors to communicate with */
116   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
117   for (i=0,j=0; i<size; i++) {
118     if (w1[i]) {pa[j] = i; j++;}
119   }
120 
121   /* Each message would have a header = 1 + 2*(no of IS) + data */
122   for (i=0; i<nrqs; i++) {
123     j      = pa[i];
124     w1[j] += w2[j] + 2*w3[j];
125     msz   += w1[j];
126   }
127 
128   /* Determine the number of messages to expect, their lengths, and their from-ids */
129   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
130   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
131 
132   /* Now post the Irecvs corresponding to these messages */
133   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
134 
135   /* Allocate Memory for outgoing messages */
136   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
137   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
138   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
139 
140   {
141     PetscInt *iptr = tmp,ict  = 0;
142     for (i=0; i<nrqs; i++) {
143       j         = pa[i];
144       iptr     +=  ict;
145       outdat[j] = iptr;
146       ict       = w1[j];
147     }
148   }
149 
150   /* Form the outgoing messages */
151   /*plug in the headers*/
152   for (i=0; i<nrqs; i++) {
153     j            = pa[i];
154     outdat[j][0] = 0;
155     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
156     ptr[j]       = outdat[j] + 2*w3[j] + 1;
157   }
158 
159   /* Memory for doing local proc's work*/
160   {
161     ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
162                         M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
163     ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
164     ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
165     ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
166     ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
167     ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
168 
169     for (i=0; i<imax; i++) {
170       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
171       data[i]  = d_p + M*i;
172     }
173   }
174 
175   /* Parse the IS and update local tables and the outgoing buf with the data*/
176   {
177     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
178     PetscBT  table_i;
179 
180     for (i=0; i<imax; i++) {
181       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
182       n_i     = n[i];
183       table_i = table[i];
184       idx_i   = idx[i];
185       data_i  = data[i];
186       isz_i   = isz[i];
187       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
188         row  = idx_i[j];
189         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
190         if (proc != rank) { /* copy to the outgoing buffer */
191           ctr[proc]++;
192           *ptr[proc] = row;
193           ptr[proc]++;
194         } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
195       }
196       /* Update the headers for the current IS */
197       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
198         if ((ctr_j = ctr[j])) {
199           outdat_j        = outdat[j];
200           k               = ++outdat_j[0];
201           outdat_j[2*k]   = ctr_j;
202           outdat_j[2*k-1] = i;
203         }
204       }
205       isz[i] = isz_i;
206     }
207   }
208 
209   /*  Now  post the sends */
210   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
211   for (i=0; i<nrqs; ++i) {
212     j    = pa[i];
213     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
214   }
215 
216   /* No longer need the original indices*/
217   for (i=0; i<imax; ++i) {
218     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
219   }
220   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
221 
222   for (i=0; i<imax; ++i) {
223     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
224   }
225 
226   /* Do Local work*/
227   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
228 
229   /* Receive messages*/
230   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
231   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
232 
233   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
234   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
235 
236   /* Phase 1 sends are complete - deallocate buffers */
237   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
238   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
239 
240   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
241   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
242   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
243   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
244   ierr = PetscFree(rbuf);CHKERRQ(ierr);
245 
246 
247   /* Send the data back*/
248   /* Do a global reduction to know the buffer space req for incoming messages*/
249   {
250     PetscMPIInt *rw1;
251 
252     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
253     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
254 
255     for (i=0; i<nrqr; ++i) {
256       proc = recv_status[i].MPI_SOURCE;
257 
258       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
259       rw1[proc] = isz1[i];
260     }
261     ierr = PetscFree(onodes1);CHKERRQ(ierr);
262     ierr = PetscFree(olengths1);CHKERRQ(ierr);
263 
264     /* Determine the number of messages to expect, their lengths, and their from-ids */
265     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
266     ierr = PetscFree(rw1);CHKERRQ(ierr);
267   }
268   /* Now post the Irecvs corresponding to these messages */
269   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
270 
271   /*  Now  post the sends */
272   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
273   for (i=0; i<nrqr; ++i) {
274     j    = recv_status[i].MPI_SOURCE;
275     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
276   }
277 
278   /* receive work done on other processors*/
279   {
280     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
281     PetscMPIInt idex;
282     PetscBT     table_i;
283     MPI_Status  *status2;
284 
285     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
286     for (i=0; i<nrqs; ++i) {
287       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
288       /* Process the message*/
289       rbuf2_i = rbuf2[idex];
290       ct1     = 2*rbuf2_i[0]+1;
291       jmax    = rbuf2[idex][0];
292       for (j=1; j<=jmax; j++) {
293         max     = rbuf2_i[2*j];
294         is_no   = rbuf2_i[2*j-1];
295         isz_i   = isz[is_no];
296         data_i  = data[is_no];
297         table_i = table[is_no];
298         for (k=0; k<max; k++,ct1++) {
299           row = rbuf2_i[ct1];
300           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
301         }
302         isz[is_no] = isz_i;
303       }
304     }
305 
306     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
307     ierr = PetscFree(status2);CHKERRQ(ierr);
308   }
309 
310   for (i=0; i<imax; ++i) {
311     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
312   }
313 
314   ierr = PetscFree(onodes2);CHKERRQ(ierr);
315   ierr = PetscFree(olengths2);CHKERRQ(ierr);
316 
317   ierr = PetscFree(pa);CHKERRQ(ierr);
318   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
319   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
320   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
321   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
322   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
323   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
324   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
325   ierr = PetscFree(s_status);CHKERRQ(ierr);
326   ierr = PetscFree(recv_status);CHKERRQ(ierr);
327   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
328   ierr = PetscFree(xdata);CHKERRQ(ierr);
329   ierr = PetscFree(isz1);CHKERRQ(ierr);
330   PetscFunctionReturn(0);
331 }
332 
333 #undef __FUNCT__
334 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
335 /*
336    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once, to do
337        the work on the local processor.
338 
339      Inputs:
340       C      - the matrix (MATMPIAIJ);
341       imax   - total number of index sets processed at a time;
342       table  - an array of bit tables (PetscBT), one per index set, each M bits long.
343 
344      Input/Output:
345       isz    - array containing the count of the solution elements corresponding
346                to each index set;
347       data   - pointer to the solutions (already holding the initial indices on entry)
348 */
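
/*
   Illustrative trace (small example): suppose data[0] = {3,4} and isz[0] = 2 on entry
   (rows 3 and 4, both locally owned, with bits 3 and 4 set in table[0]).  If local row 3
   has nonzeros in global columns {2,3,7} and row 4 in {3,4,9}, then on exit
   data[0] = {3,4,2,7,9} (discovery order) and isz[0] = 5; columns whose bits are already
   set in table[0] are skipped.
*/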
349 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
350 {
351   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
352   Mat        A  = c->A,B = c->B;
353   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
354   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
355   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
356   PetscBT    table_i;
357 
358   PetscFunctionBegin;
359   rstart = C->rmap->rstart;
360   cstart = C->cmap->rstart;
361   ai     = a->i;
362   aj     = a->j;
363   bi     = b->i;
364   bj     = b->j;
365   garray = c->garray;
366 
367 
368   for (i=0; i<imax; i++) {
369     data_i  = data[i];
370     table_i = table[i];
371     isz_i   = isz[i];
372     for (j=0,max=isz[i]; j<max; j++) {
373       row   = data_i[j] - rstart;
374       start = ai[row];
375       end   = ai[row+1];
376       for (k=start; k<end; k++) { /* Amat */
377         val = aj[k] + cstart;
378         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379       }
380       start = bi[row];
381       end   = bi[row+1];
382       for (k=start; k<end; k++) { /* Bmat */
383         val = garray[bj[k]];
384         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
385       }
386     }
387     isz[i] = isz_i;
388   }
389   PetscFunctionReturn(0);
390 }
391 
392 #undef __FUNCT__
393 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
394 /*
395       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
396          and return the output
397 
398          Input:
399            C    - the matrix
400            nrqr - number of messages being processed.
401            rbuf - an array of pointers to the received requests
402 
403          Output:
404            xdata - array of messages to be sent back
405            isz1  - size of each message
406 
407   For better efficiency perhaps we should malloc each xdata[i] separately;
408 then if a reallocation is required we need only copy the data for that one row
409 rather than all previous rows, as happens now where a single large chunk of
410 memory is used.
411 
412 */
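
/*
   Illustrative reply layout (mirrors the request format sketched above): for a request
   about is[1] and is[5], the reply stored in xdata[i] is

     xdata[i][0] = 2                       (number of index sets)
     xdata[i][1] = 1, xdata[i][2] = n1     (is[1]: n1 candidate indices found)
     xdata[i][3] = 5, xdata[i][4] = n2     (is[5]: n2 candidate indices found)
     xdata[i][5 ...]                       the n1 indices for is[1], then the n2 for is[5]

   and isz1[i] is the total length of this message.
*/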
413 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
414 {
415   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
416   Mat            A  = c->A,B = c->B;
417   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
418   PetscErrorCode ierr;
419   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
420   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
421   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
422   PetscInt       *rbuf_i,kmax,rbuf_0;
423   PetscBT        xtable;
424 
425   PetscFunctionBegin;
426   m      = C->rmap->N;
427   rstart = C->rmap->rstart;
428   cstart = C->cmap->rstart;
429   ai     = a->i;
430   aj     = a->j;
431   bi     = b->i;
432   bj     = b->j;
433   garray = c->garray;
434 
435 
436   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
437     rbuf_i =  rbuf[i];
438     rbuf_0 =  rbuf_i[0];
439     ct    += rbuf_0;
440     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
441   }
442 
443   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
444   else max1 = 1;
445   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
446   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
447   ++no_malloc;
448   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
449   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
450 
451   ct3 = 0;
452   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
453     rbuf_i =  rbuf[i];
454     rbuf_0 =  rbuf_i[0];
455     ct1    =  2*rbuf_0+1;
456     ct2    =  ct1;
457     ct3   += ct1;
458     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
459       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
460       oct2 = ct2;
461       kmax = rbuf_i[2*j];
462       for (k=0; k<kmax; k++,ct1++) {
463         row = rbuf_i[ct1];
464         if (!PetscBTLookupSet(xtable,row)) {
465           if (!(ct3 < mem_estimate)) {
466             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
467             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
468             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
469             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
470             xdata[0]     = tmp;
471             mem_estimate = new_estimate; ++no_malloc;
472             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
473           }
474           xdata[i][ct2++] = row;
475           ct3++;
476         }
477       }
478       for (k=oct2,max2=ct2; k<max2; k++) {
479         row   = xdata[i][k] - rstart;
480         start = ai[row];
481         end   = ai[row+1];
482         for (l=start; l<end; l++) {
483           val = aj[l] + cstart;
484           if (!PetscBTLookupSet(xtable,val)) {
485             if (!(ct3 < mem_estimate)) {
486               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
487               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
488               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
489               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
490               xdata[0]     = tmp;
491               mem_estimate = new_estimate; ++no_malloc;
492               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
493             }
494             xdata[i][ct2++] = val;
495             ct3++;
496           }
497         }
498         start = bi[row];
499         end   = bi[row+1];
500         for (l=start; l<end; l++) {
501           val = garray[bj[l]];
502           if (!PetscBTLookupSet(xtable,val)) {
503             if (!(ct3 < mem_estimate)) {
504               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
505               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
506               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
507               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
508               xdata[0]     = tmp;
509               mem_estimate = new_estimate; ++no_malloc;
510               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
511             }
512             xdata[i][ct2++] = val;
513             ct3++;
514           }
515         }
516       }
517       /* Update the header*/
518       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
519       xdata[i][2*j-1] = rbuf_i[2*j-1];
520     }
521     xdata[i][0] = rbuf_0;
522     xdata[i+1]  = xdata[i] + ct2;
523     isz1[i]     = ct2; /* size of each message */
524   }
525   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
526   ierr = PetscInfo3(C,"Allocated %D PetscInt entries, used %D, number of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
527   PetscFunctionReturn(0);
528 }
529 /* -------------------------------------------------------------------------*/
530 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
531 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
532 /*
533     Every processor gets the entire matrix
534 */
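
/*
   Illustrative trigger (sketch): MatGetSubMatrices_MPIAIJ() below selects this path when
   every process requests the whole (square) matrix with identity index sets, e.g. for a
   parallel matrix C

     IS       isrow,iscol;
     Mat      *submat;
     PetscInt M,N;
     ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   (the -use_fast_submatrix option can override this choice).
*/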
535 #undef __FUNCT__
536 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
537 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
538 {
539   Mat            B;
540   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
541   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
542   PetscErrorCode ierr;
543   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
544   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
545   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
546   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
547 
548   PetscFunctionBegin;
549   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
550   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);
551 
552   if (scall == MAT_INITIAL_MATRIX) {
553     /* ----------------------------------------------------------------
554          Tell every processor the number of nonzeros per row
555     */
556     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
557     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
558       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
559     }
560     sendcount = A->rmap->rend - A->rmap->rstart;
561     ierr      = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
562     for (i=0; i<size; i++) {
563       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
564       displs[i]     = A->rmap->range[i];
565     }
566 #if defined(PETSC_HAVE_MPI_IN_PLACE)
567     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
568 #else
569     ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
570 #endif
571     /* ---------------------------------------------------------------
572          Create the sequential matrix of the same type as the local block diagonal
573     */
574     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
575     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
576     ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
577     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
578     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
579     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
580     **Bin = B;
581     b     = (Mat_SeqAIJ*)B->data;
582 
583     /*--------------------------------------------------------------------
584        Copy my part of matrix column indices over
585     */
586     sendcount  = ad->nz + bd->nz;
587     jsendbuf   = b->j + b->i[rstarts[rank]];
588     a_jsendbuf = ad->j;
589     b_jsendbuf = bd->j;
590     n          = A->rmap->rend - A->rmap->rstart;
591     cnt        = 0;
592     for (i=0; i<n; i++) {
593 
594       /* put in lower diagonal portion */
595       m = bd->i[i+1] - bd->i[i];
596       while (m > 0) {
597         /* is it above diagonal (in bd (compressed) numbering) */
598         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
599         jsendbuf[cnt++] = garray[*b_jsendbuf++];
600         m--;
601       }
602 
603       /* put in diagonal portion */
604       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
605         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
606       }
607 
608       /* put in upper diagonal portion */
609       while (m-- > 0) {
610         jsendbuf[cnt++] = garray[*b_jsendbuf++];
611       }
612     }
613     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
614 
615     /*--------------------------------------------------------------------
616        Gather all column indices to all processors
617     */
618     for (i=0; i<size; i++) {
619       recvcounts[i] = 0;
620       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
621         recvcounts[i] += lens[j];
622       }
623     }
624     displs[0] = 0;
625     for (i=1; i<size; i++) {
626       displs[i] = displs[i-1] + recvcounts[i-1];
627     }
628 #if defined(PETSC_HAVE_MPI_IN_PLACE)
629     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
630 #else
631     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
632 #endif
633     /*--------------------------------------------------------------------
634         Assemble the matrix into usable form (note numerical values not yet set)
635     */
636     /* set the b->ilen (length of each row) values */
637     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
638     /* set the b->i indices */
639     b->i[0] = 0;
640     for (i=1; i<=A->rmap->N; i++) {
641       b->i[i] = b->i[i-1] + lens[i-1];
642     }
643     ierr = PetscFree(lens);CHKERRQ(ierr);
644     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
645     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
646 
647   } else {
648     B = **Bin;
649     b = (Mat_SeqAIJ*)B->data;
650   }
651 
652   /*--------------------------------------------------------------------
653        Copy my part of matrix numerical values into the values location
654   */
655   if (flag == MAT_GET_VALUES) {
656     sendcount = ad->nz + bd->nz;
657     sendbuf   = b->a + b->i[rstarts[rank]];
658     a_sendbuf = ad->a;
659     b_sendbuf = bd->a;
660     b_sendj   = bd->j;
661     n         = A->rmap->rend - A->rmap->rstart;
662     cnt       = 0;
663     for (i=0; i<n; i++) {
664 
665       /* put in lower diagonal portion */
666       m = bd->i[i+1] - bd->i[i];
667       while (m > 0) {
668         /* is it above diagonal (in bd (compressed) numbering) */
669         if (garray[*b_sendj] > A->rmap->rstart + i) break;
670         sendbuf[cnt++] = *b_sendbuf++;
671         m--;
672         b_sendj++;
673       }
674 
675       /* put in diagonal portion */
676       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
677         sendbuf[cnt++] = *a_sendbuf++;
678       }
679 
680       /* put in upper diagonal portion */
681       while (m-- > 0) {
682         sendbuf[cnt++] = *b_sendbuf++;
683         b_sendj++;
684       }
685     }
686     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
687 
688     /* -----------------------------------------------------------------
689        Gather all numerical values to all processors
690     */
691     if (!recvcounts) {
692       ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
693     }
694     for (i=0; i<size; i++) {
695       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
696     }
697     displs[0] = 0;
698     for (i=1; i<size; i++) {
699       displs[i] = displs[i-1] + recvcounts[i-1];
700     }
701     recvbuf = b->a;
702 #if defined(PETSC_HAVE_MPI_IN_PLACE)
703     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
704 #else
705     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
706 #endif
707   }  /* end of (flag == MAT_GET_VALUES) */
708   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
709 
710   if (A->symmetric) {
711     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
712   } else if (A->hermitian) {
713     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
714   } else if (A->structurally_symmetric) {
715     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
716   }
717   PetscFunctionReturn(0);
718 }
719 
720 
721 
722 #undef __FUNCT__
723 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
724 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
725 {
726   PetscErrorCode ierr;
727   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
728   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
729 
730   PetscFunctionBegin;
731   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
732   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
733      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
734      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
735    */
736   for (i = 0; i < ismax; ++i) {
737     PetscBool sorted;
738     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
739     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
740   }
741 
742   /*
743        Check for special case: each processor gets entire matrix
744   */
745   if (ismax == 1 && C->rmap->N == C->cmap->N) {
746     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
747     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
748     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
749     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
750     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
751       wantallmatrix = PETSC_TRUE;
752 
753       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
754     }
755   }
756   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
757   if (twantallmatrix) {
758     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
759     PetscFunctionReturn(0);
760   }
761 
762   /* Allocate memory to hold all the submatrices */
763   if (scall != MAT_REUSE_MATRIX) {
764     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
765   }
766 
767   /* Check for special case: each processor gets entire matrix columns */
768   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
769   for (i=0; i<ismax; i++) {
770     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
771     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
772     if (colflag && ncol == C->cmap->N) {
773       allcolumns[i] = PETSC_TRUE;
774     } else {
775       allcolumns[i] = PETSC_FALSE;
776     }
777   }
778 
779   /* Determine the number of stages through which submatrices are done */
780   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
781 
782   /*
783      Each stage will extract nmax submatrices.
784      nmax is determined by the matrix column dimension.
785      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
786   */
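  /*
     Worked example (assuming an 8-byte PetscInt): for C->cmap->N = 1,000,000 columns,
     nmax = 20,000,000 / (1,000,000 * 8) = 2, i.e. two submatrices per stage; with
     20,000,000 columns the quotient is 0 and nmax is bumped to 1 below.
  */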
787   if (!nmax) nmax = 1;
788   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
789 
790   /* Make sure every processor loops through the nstages */
791   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
792 
793   for (i=0,pos=0; i<nstages; i++) {
794     if (pos+nmax <= ismax) max_no = nmax;
795     else if (pos == ismax) max_no = 0;
796     else                   max_no = ismax-pos;
797     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
798     pos += max_no;
799   }
800 
801   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
802   PetscFunctionReturn(0);
803 }
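
/*
   Illustrative usage (sketch): the routine above implements MatGetSubMatrices() for
   MATMPIAIJ; a typical caller extracts several sequential submatrices at once and can
   later refill them in place:

     Mat *submat;
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ...
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);

   where isrow[] and iscol[] are arrays of n index sets; the iscol[] must be sorted (see
   the check at the top of the routine above).
*/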
804 
805 /* -------------------------------------------------------------------------*/
806 #undef __FUNCT__
807 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
808 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
809 {
810   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
811   Mat            A  = c->A;
812   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
813   const PetscInt **icol,**irow;
814   PetscInt       *nrow,*ncol,start;
815   PetscErrorCode ierr;
816   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
817   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
818   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
819   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
820   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
821 #if defined(PETSC_USE_CTABLE)
822   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
823 #else
824   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
825 #endif
826   const PetscInt *irow_i;
827   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
828   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
829   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
830   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
831   MPI_Status     *r_status3,*r_status4,*s_status4;
832   MPI_Comm       comm;
833   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
834   PetscMPIInt    *onodes1,*olengths1;
835   PetscMPIInt    idex,idex2,end;
836 
837   PetscFunctionBegin;
838   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
839   tag0 = ((PetscObject)C)->tag;
840   size = c->size;
841   rank = c->rank;
842 
843   /* Get some new tags to keep the communication clean */
844   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
845   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
846   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
847 
848   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
849 
850   for (i=0; i<ismax; i++) {
851     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
852     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
853     if (allcolumns[i]) {
854       icol[i] = NULL;
855       ncol[i] = C->cmap->N;
856     } else {
857       ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
858       ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
859     }
860   }
861 
862   /* evaluate communication - mesg to who, length of mesg, and buffer space
863      required. Based on this, buffers are allocated, and data copied into them*/
864   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);   /* mesg size */
865   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
866   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
867   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
868   for (i=0; i<ismax; i++) {
869     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
870     jmax   = nrow[i];
871     irow_i = irow[i];
872     for (j=0; j<jmax; j++) {
873       l   = 0;
874       row = irow_i[j];
875       while (row >= C->rmap->range[l+1]) l++;
876       proc = l;
877       w4[proc]++;
878     }
879     for (j=0; j<size; j++) {
880       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
881     }
882   }
883 
884   nrqs     = 0;              /* no of outgoing messages */
885   msz      = 0;              /* total mesg length (for all procs) */
886   w1[rank] = 0;              /* no mesg sent to self */
887   w3[rank] = 0;
888   for (i=0; i<size; i++) {
889     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
890   }
891   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
892   for (i=0,j=0; i<size; i++) {
893     if (w1[i]) { pa[j] = i; j++; }
894   }
895 
896   /* Each message would have a header = 1 + 2*(no of IS) + data */
897   for (i=0; i<nrqs; i++) {
898     j      = pa[i];
899     w1[j] += w2[j] + 2* w3[j];
900     msz   += w1[j];
901   }
902   ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
903 
904   /* Determine the number of messages to expect, their lengths, and their from-ids */
905   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
906   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
907 
908   /* Now post the Irecvs corresponding to these messages */
909   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
910 
911   ierr = PetscFree(onodes1);CHKERRQ(ierr);
912   ierr = PetscFree(olengths1);CHKERRQ(ierr);
913 
914   /* Allocate Memory for outgoing messages */
915   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
916   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
917   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
918 
919   {
920     PetscInt *iptr = tmp,ict = 0;
921     for (i=0; i<nrqs; i++) {
922       j        = pa[i];
923       iptr    += ict;
924       sbuf1[j] = iptr;
925       ict      = w1[j];
926     }
927   }
928 
929   /* Form the outgoing messages */
930   /* Initialize the header space */
931   for (i=0; i<nrqs; i++) {
932     j           = pa[i];
933     sbuf1[j][0] = 0;
934     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
935     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
936   }
937 
938   /* Parse the isrow and copy data into outbuf */
939   for (i=0; i<ismax; i++) {
940     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
941     irow_i = irow[i];
942     jmax   = nrow[i];
943     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
944       l   = 0;
945       row = irow_i[j];
946       while (row >= C->rmap->range[l+1]) l++;
947       proc = l;
948       if (proc != rank) { /* copy to the outgoing buf*/
949         ctr[proc]++;
950         *ptr[proc] = row;
951         ptr[proc]++;
952       }
953     }
954     /* Update the headers for the current IS */
955     for (j=0; j<size; j++) { /* Can Optimise this loop too */
956       if ((ctr_j = ctr[j])) {
957         sbuf1_j        = sbuf1[j];
958         k              = ++sbuf1_j[0];
959         sbuf1_j[2*k]   = ctr_j;
960         sbuf1_j[2*k-1] = i;
961       }
962     }
963   }
964 
965   /*  Now  post the sends */
966   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
967   for (i=0; i<nrqs; ++i) {
968     j    = pa[i];
969     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
970   }
971 
972   /* Post Receives to capture the buffer size */
973   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
974   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
975   rbuf2[0] = tmp + msz;
976   for (i=1; i<nrqs; ++i) {
977     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
978   }
979   for (i=0; i<nrqs; ++i) {
980     j    = pa[i];
981     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
982   }
983 
984   /* Send to other procs the buf size they should allocate */
985 
986 
987   /* Receive messages*/
988   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
989   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
990   ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
991   {
992     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
993     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
994     PetscInt   *sbuf2_i;
995 
996     for (i=0; i<nrqr; ++i) {
997       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
998 
999       req_size[idex] = 0;
1000       rbuf1_i        = rbuf1[idex];
1001       start          = 2*rbuf1_i[0] + 1;
1002       ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
1003       ierr           = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
1004       sbuf2_i        = sbuf2[idex];
1005       for (j=start; j<end; j++) {
1006         id              = rbuf1_i[j] - rstart;
1007         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1008         sbuf2_i[j]      = ncols;
1009         req_size[idex] += ncols;
1010       }
1011       req_source[idex] = r_status1[i].MPI_SOURCE;
1012       /* form the header */
1013       sbuf2_i[0] = req_size[idex];
1014       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1015 
1016       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
1017     }
1018   }
1019   ierr = PetscFree(r_status1);CHKERRQ(ierr);
1020   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1021 
1022   /*  recv buffer sizes */
1023   /* Receive messages*/
1024 
1025   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
1026   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1027   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1028   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1029   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1030 
1031   for (i=0; i<nrqs; ++i) {
1032     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1033     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
1034     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1035     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1036     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1037   }
1038   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1039   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1040 
1041   /* Wait on sends1 and sends2 */
1042   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1043   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1044 
1045   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1046   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1047   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1048   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1049   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1050   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1051 
1052   /* Now allocate buffers for a->j, and send them off */
1053   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
1054   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1055   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
1056   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1057 
1058   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1059   {
1060     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1061     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1062     PetscInt cend = C->cmap->rend;
1063     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1064 
1065     for (i=0; i<nrqr; i++) {
1066       rbuf1_i   = rbuf1[i];
1067       sbuf_aj_i = sbuf_aj[i];
1068       ct1       = 2*rbuf1_i[0] + 1;
1069       ct2       = 0;
1070       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1071         kmax = rbuf1[i][2*j];
1072         for (k=0; k<kmax; k++,ct1++) {
1073           row    = rbuf1_i[ct1] - rstart;
1074           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1075           ncols  = nzA + nzB;
1076           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1077 
1078           /* load the column indices for this row into cols*/
1079           cols = sbuf_aj_i + ct2;
1080 
1081           lwrite = 0;
1082           for (l=0; l<nzB; l++) {
1083             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1084           }
1085           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1086           for (l=0; l<nzB; l++) {
1087             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1088           }
1089 
1090           ct2 += ncols;
1091         }
1092       }
1093       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1094     }
1095   }
1096   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1097   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1098 
1099   /* Allocate buffers for a->a, and send them off */
1100   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1101   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1102   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1103   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1104 
1105   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1106   {
1107     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1108     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1109     PetscInt    cend   = C->cmap->rend;
1110     PetscInt    *b_j   = b->j;
1111     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1112 
1113     for (i=0; i<nrqr; i++) {
1114       rbuf1_i   = rbuf1[i];
1115       sbuf_aa_i = sbuf_aa[i];
1116       ct1       = 2*rbuf1_i[0]+1;
1117       ct2       = 0;
1118       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1119         kmax = rbuf1_i[2*j];
1120         for (k=0; k<kmax; k++,ct1++) {
1121           row    = rbuf1_i[ct1] - rstart;
1122           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1123           ncols  = nzA + nzB;
1124           cworkB = b_j + b_i[row];
1125           vworkA = a_a + a_i[row];
1126           vworkB = b_a + b_i[row];
1127 
1128           /* load the column values for this row into vals*/
1129           vals = sbuf_aa_i+ct2;
1130 
1131           lwrite = 0;
1132           for (l=0; l<nzB; l++) {
1133             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1134           }
1135           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1136           for (l=0; l<nzB; l++) {
1137             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1138           }
1139 
1140           ct2 += ncols;
1141         }
1142       }
1143       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1144     }
1145   }
1146   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1147   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1148   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1149   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1150 
1151   /* Form the matrix */
1152   /* create col map: global col of C -> local col of submatrices */
1153   {
1154     const PetscInt *icol_i;
1155 #if defined(PETSC_USE_CTABLE)
1156     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
1157     for (i=0; i<ismax; i++) {
1158       if (!allcolumns[i]) {
1159         ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
1160 
1161         jmax   = ncol[i];
1162         icol_i = icol[i];
1163         cmap_i = cmap[i];
1164         for (j=0; j<jmax; j++) {
1165           ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1166         }
1167       } else {
1168         cmap[i] = NULL;
1169       }
1170     }
1171 #else
1172     ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
1173     for (i=0; i<ismax; i++) {
1174       if (!allcolumns[i]) {
1175         ierr   = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
1176         ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1177         jmax   = ncol[i];
1178         icol_i = icol[i];
1179         cmap_i = cmap[i];
1180         for (j=0; j<jmax; j++) {
1181           cmap_i[icol_i[j]] = j+1;
1182         }
1183       } else {
1184         cmap[i] = NULL;
1185       }
1186     }
1187 #endif
1188   }
1189 
1190   /* Create lens which is required for MatCreate... */
1191   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1192   ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
1193   if (ismax) {
1194     ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
1195     ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1196   }
1197   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1198 
1199   /* Update lens from local data */
1200   for (i=0; i<ismax; i++) {
1201     jmax = nrow[i];
1202     if (!allcolumns[i]) cmap_i = cmap[i];
1203     irow_i = irow[i];
1204     lens_i = lens[i];
1205     for (j=0; j<jmax; j++) {
1206       l   = 0;
1207       row = irow_i[j];
1208       while (row >= C->rmap->range[l+1]) l++;
1209       proc = l;
1210       if (proc == rank) {
1211         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1212         if (!allcolumns[i]) {
1213           for (k=0; k<ncols; k++) {
1214 #if defined(PETSC_USE_CTABLE)
1215             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1216 #else
1217             tcol = cmap_i[cols[k]];
1218 #endif
1219             if (tcol) lens_i[j]++;
1220           }
1221         } else { /* allcolumns */
1222           lens_i[j] = ncols;
1223         }
1224         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1225       }
1226     }
1227   }
1228 
1229   /* Create row map: global row of C -> local row of submatrices */
1230 #if defined(PETSC_USE_CTABLE)
1231   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
1232   for (i=0; i<ismax; i++) {
1233     ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
1234     rmap_i = rmap[i];
1235     irow_i = irow[i];
1236     jmax   = nrow[i];
1237     for (j=0; j<jmax; j++) {
1238       ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1239     }
1240   }
1241 #else
1242   ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
1243   if (ismax) {
1244     ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
1245     ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1246   }
1247   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1248   for (i=0; i<ismax; i++) {
1249     rmap_i = rmap[i];
1250     irow_i = irow[i];
1251     jmax   = nrow[i];
1252     for (j=0; j<jmax; j++) {
1253       rmap_i[irow_i[j]] = j;
1254     }
1255   }
1256 #endif
1257 
1258   /* Update lens from offproc data */
1259   {
1260     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1261 
1262     for (tmp2=0; tmp2<nrqs; tmp2++) {
1263       ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1264       idex    = pa[idex2];
1265       sbuf1_i = sbuf1[idex];
1266       jmax    = sbuf1_i[0];
1267       ct1     = 2*jmax+1;
1268       ct2     = 0;
1269       rbuf2_i = rbuf2[idex2];
1270       rbuf3_i = rbuf3[idex2];
1271       for (j=1; j<=jmax; j++) {
1272         is_no  = sbuf1_i[2*j-1];
1273         max1   = sbuf1_i[2*j];
1274         lens_i = lens[is_no];
1275         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1276         rmap_i = rmap[is_no];
1277         for (k=0; k<max1; k++,ct1++) {
1278 #if defined(PETSC_USE_CTABLE)
1279           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1280           row--;
1281           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1282 #else
1283           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1284 #endif
1285           max2 = rbuf2_i[ct1];
1286           for (l=0; l<max2; l++,ct2++) {
1287             if (!allcolumns[is_no]) {
1288 #if defined(PETSC_USE_CTABLE)
1289               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1290 #else
1291               tcol = cmap_i[rbuf3_i[ct2]];
1292 #endif
1293               if (tcol) lens_i[row]++;
1294             } else { /* allcolumns */
1295               lens_i[row]++; /* lens_i[row] += max2 ? */
1296             }
1297           }
1298         }
1299       }
1300     }
1301   }
1302   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1303   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1304   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1305   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1306   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1307 
1308   /* Create the submatrices */
1309   if (scall == MAT_REUSE_MATRIX) {
1310     PetscBool flag;
1311 
1312     /*
1313         Assumes the new rows are the same length as the old rows; hence this is a bug!
1314     */
1315     for (i=0; i<ismax; i++) {
1316       mat = (Mat_SeqAIJ*)(submats[i]->data);
1317       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1318       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1319       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1320       /* Initialize the matrix as if it were empty */
1321       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1322 
1323       submats[i]->factortype = C->factortype;
1324     }
1325   } else {
1326     for (i=0; i<ismax; i++) {
1327       PetscInt rbs,cbs;
1328       ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
1329       ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);
1330 
1331       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1332       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1333 
1334       ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
1335       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1336       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1337     }
1338   }
1339 
1340   /* Assemble the matrices */
1341   /* First assemble the local rows */
1342   {
1343     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1344     PetscScalar *imat_a;
1345 
1346     for (i=0; i<ismax; i++) {
1347       mat       = (Mat_SeqAIJ*)submats[i]->data;
1348       imat_ilen = mat->ilen;
1349       imat_j    = mat->j;
1350       imat_i    = mat->i;
1351       imat_a    = mat->a;
1352 
1353       if (!allcolumns[i]) cmap_i = cmap[i];
1354       rmap_i = rmap[i];
1355       irow_i = irow[i];
1356       jmax   = nrow[i];
1357       for (j=0; j<jmax; j++) {
1358         l   = 0;
1359         row = irow_i[j];
1360         while (row >= C->rmap->range[l+1]) l++;
1361         proc = l;
1362         if (proc == rank) {
1363           old_row = row;
1364 #if defined(PETSC_USE_CTABLE)
1365           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1366           row--;
1367 #else
1368           row = rmap_i[row];
1369 #endif
1370           ilen_row = imat_ilen[row];
1371           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1372           mat_i    = imat_i[row];
1373           mat_a    = imat_a + mat_i;
1374           mat_j    = imat_j + mat_i;
1375           if (!allcolumns[i]) {
1376             for (k=0; k<ncols; k++) {
1377 #if defined(PETSC_USE_CTABLE)
1378               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1379 #else
1380               tcol = cmap_i[cols[k]];
1381 #endif
1382               if (tcol) {
1383                 *mat_j++ = tcol - 1;
1384                 *mat_a++ = vals[k];
1385                 ilen_row++;
1386               }
1387             }
1388           } else { /* allcolumns */
1389             for (k=0; k<ncols; k++) {
1390               *mat_j++ = cols[k];  /* global col index! */
1391               *mat_a++ = vals[k];
1392               ilen_row++;
1393             }
1394           }
1395           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1396 
1397           imat_ilen[row] = ilen_row;
1398         }
1399       }
1400     }
1401   }
1402 
1403   /* Now assemble the off-process rows */
1404   {
1405     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1406     PetscInt    *imat_j,*imat_i;
1407     PetscScalar *imat_a,*rbuf4_i;
1408 
1409     for (tmp2=0; tmp2<nrqs; tmp2++) {
1410       ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
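      /* idex2 identifies which outstanding reply just completed; pa[idex2] is the rank of the
         responding process, and sbuf1[pa[idex2]] is the request originally sent to that rank */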
1411       idex    = pa[idex2];
1412       sbuf1_i = sbuf1[idex];
1413       jmax    = sbuf1_i[0];
1414       ct1     = 2*jmax + 1;
1415       ct2     = 0;
1416       rbuf2_i = rbuf2[idex2];
1417       rbuf3_i = rbuf3[idex2];
1418       rbuf4_i = rbuf4[idex2];
1419       for (j=1; j<=jmax; j++) {
1420         is_no     = sbuf1_i[2*j-1];
1421         rmap_i    = rmap[is_no];
1422         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1423         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1424         imat_ilen = mat->ilen;
1425         imat_j    = mat->j;
1426         imat_i    = mat->i;
1427         imat_a    = mat->a;
1428         max1      = sbuf1_i[2*j];
1429         for (k=0; k<max1; k++,ct1++) {
1430           row = sbuf1_i[ct1];
1431 #if defined(PETSC_USE_CTABLE)
1432           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1433           row--;
1434 #else
1435           row = rmap_i[row];
1436 #endif
1437           ilen  = imat_ilen[row];
1438           mat_i = imat_i[row];
1439           mat_a = imat_a + mat_i;
1440           mat_j = imat_j + mat_i;
1441           max2  = rbuf2_i[ct1];
1442           if (!allcolumns[is_no]) {
1443             for (l=0; l<max2; l++,ct2++) {
1444 
1445 #if defined(PETSC_USE_CTABLE)
1446               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1447 #else
1448               tcol = cmap_i[rbuf3_i[ct2]];
1449 #endif
1450               if (tcol) {
1451                 *mat_j++ = tcol - 1;
1452                 *mat_a++ = rbuf4_i[ct2];
1453                 ilen++;
1454               }
1455             }
1456           } else { /* allcolumns */
1457             for (l=0; l<max2; l++,ct2++) {
1458               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1459               *mat_a++ = rbuf4_i[ct2];
1460               ilen++;
1461             }
1462           }
1463           imat_ilen[row] = ilen;
1464         }
1465       }
1466     }
1467   }
1468 
1469   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1470   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1471   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1472   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1473   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1474 
1475   /* Restore the indices */
1476   for (i=0; i<ismax; i++) {
1477     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1478     if (!allcolumns[i]) {
1479       ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1480     }
1481   }
1482 
1483   /* Destroy allocated memory */
1484   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1485   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1486   ierr = PetscFree(pa);CHKERRQ(ierr);
1487 
1488   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1489   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1490   for (i=0; i<nrqr; ++i) {
1491     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1492   }
1493   for (i=0; i<nrqs; ++i) {
1494     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1495     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1496   }
1497 
1498   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1499   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1500   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1501   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1502   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1503   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1504   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1505 
1506 #if defined(PETSC_USE_CTABLE)
1507   for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
1508 #else
1509   if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
1510 #endif
1511   ierr = PetscFree(rmap);CHKERRQ(ierr);
1512 
1513   for (i=0; i<ismax; i++) {
1514     if (!allcolumns[i]) {
1515 #if defined(PETSC_USE_CTABLE)
1516       ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
1517 #else
1518       ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
1519 #endif
1520     }
1521   }
1522   ierr = PetscFree(cmap);CHKERRQ(ierr);
1523   if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1524   ierr = PetscFree(lens);CHKERRQ(ierr);
1525 
1526   for (i=0; i<ismax; i++) {
1527     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1528     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1529   }
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 /*
1534  Observe that the Seq matrices used to construct this MPI matrix are not increfed (their reference
1535  counts are not increased), so be careful not to destroy them elsewhere.
1536  */
1537 #undef __FUNCT__
1538 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1539 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1540 {
1541   /* If this function is ever made public, change the errors returned here away from PETSC_ERR_PLIB. */
1542   PetscErrorCode ierr;
1543   Mat_MPIAIJ     *aij;
1544   PetscBool      seqaij;
1545 
1546   PetscFunctionBegin;
1547   /* Check to make sure the component matrices are compatible with C. */
1548   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1549   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1550   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1551   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1552   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1553 
1554   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1555   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1556   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
1557   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1558   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1559   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1560   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1561   aij    = (Mat_MPIAIJ*)((*C)->data);
1562   aij->A = A;
1563   aij->B = B;
1564   ierr   = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
1565   ierr   = PetscLogObjectParent(*C,B);CHKERRQ(ierr);
1566 
1567   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1568   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1569   PetscFunctionReturn(0);
1570 }
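
/*
   A minimal usage sketch for MatCreateMPIAIJFromSeqMatrices_Private() (illustrative only, not part of
   the original source).  It assumes two MPI ranks, each supplying a 3x3 SeqAIJ diagonal block A and a
   3x3 SeqAIJ off-diagonal block B, so that the global column count (6) equals A->cmap->n + B->cmap->n
   on every rank.  Because the Seq matrices are not increfed, they are disposed of along with C and
   must not be destroyed separately.
*/
#if 0
static PetscErrorCode ExampleBuildMPIAIJFromSeqPieces(Mat *C)
{
  PetscErrorCode ierr;
  Mat            A,B;
  PetscInt       r;

  PetscFunctionBegin;
  ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,3,3,1,NULL,&A);CHKERRQ(ierr); /* local (diagonal) block */
  ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,3,3,1,NULL,&B);CHKERRQ(ierr); /* off-diagonal block */
  for (r=0; r<3; r++) {
    ierr = MatSetValue(A,r,r,2.0,INSERT_VALUES);CHKERRQ(ierr);
    ierr = MatSetValue(B,r,r,-1.0,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatCreateMPIAIJFromSeqMatrices_Private(PETSC_COMM_WORLD,A,B,C);CHKERRQ(ierr);
  /* do NOT call MatDestroy() on A or B here: destroying *C disposes of them */
  PetscFunctionReturn(0);
}
#endif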
1571 
1572 #undef __FUNCT__
1573 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1574 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1575 {
1576   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1577 
1578   PetscFunctionBegin;
1579   PetscValidPointer(A,2);
1580   PetscValidPointer(B,3);
1581   *A = aij->A;
1582   *B = aij->B;
1583   /* Note that we don't incref *A and *B, so be careful! */
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 #undef __FUNCT__
1588 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1589 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1590                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1591                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1592                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1593 {
1594   PetscErrorCode ierr;
1595   PetscMPIInt    size, flag;
1596   PetscInt       i,ii;
1597   PetscInt       ismax_c;
1598 
1599   PetscFunctionBegin;
1600   if (!ismax) PetscFunctionReturn(0);
1601 
1602   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1603     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1604     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1605     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1606     if (size > 1) ++ismax_c;
1607   }
1608   if (!ismax_c) { /* Sequential ISs only, so we can call the sequential matrix extraction subroutine. */
1609     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1610   } else { /* if (ismax_c) */
1611     Mat         *A,*B;
1612     IS          *isrow_c, *iscol_c;
1613     PetscMPIInt size;
1614     /*
1615      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1616      array of sequential matrices underlying the resulting parallel matrices.
1617      Which arrays to allocate is based on the value of MatReuse scall.
1618      There are as many diag matrices as there are original index sets.
1619      There are only as many parallel and off-diag matrices as there are parallel (comm size > 1) index sets.
1620 
1621      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1622      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
1623      getsubmats_seq will deposit the extracted diag and off-diag parts.
1624      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1625      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1626     */
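    /*
     For example (illustrative only): with ismax = 3 and index-set communicator sizes {1, 4, 1},
     ismax_c = 1, so A[] holds three sequential diag matrices, B[] holds one off-diag matrix, and
     only submat[1] ends up being a parallel (MPIAIJ) matrix.
    */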
1627 
1628     /* Parallel matrix array is allocated only if no reuse is taking place. */
1629     if (scall != MAT_REUSE_MATRIX) {
1630       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1631     } else {
1632       ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
1633       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
1634       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1635       for (i = 0, ii = 0; i < ismax; ++i) {
1636         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1637         if (size > 1) {
1638           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1639           ++ii;
1640         } else A[i] = (*submat)[i];
1641       }
1642     }
1643     /*
1644      Construct, for the parallel ISs only, the nonlocal parts of the iscol ISs (the column indices contributed by other processes).
1645      These are used to extract the off-diag portion of the resulting parallel matrix.
1646      The row IS for the off-diag portion is the same as for the diag portion,
1647      so we merely alias the row IS, while skipping those that are sequential.
1648     */
1649     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
1650     for (i = 0, ii = 0; i < ismax; ++i) {
1651       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1652       if (size > 1) {
1653         isrow_c[ii] = isrow[i];
1654 
1655         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1656         ++ii;
1657       }
1658     }
1659     /* Now obtain the sequential A and B submatrices separately. */
1660     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1661     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1662     for (ii = 0; ii < ismax_c; ++ii) {
1663       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1664     }
1665     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1666     /*
1667      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1668      have been extracted directly into the parallel matrices containing them, or
1669      simply into the sequential matrix identical with the corresponding A (if size == 1).
1670      Otherwise, make sure that the parallel matrices are constructed from A & B, or that
1671      A is placed into the correct submat slot (if size == 1).
1672      */
1673     if (scall != MAT_REUSE_MATRIX) {
1674       for (i = 0, ii = 0; i < ismax; ++i) {
1675         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1676         if (size > 1) {
1677           /*
1678            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1679            */
1680           /* Construct submat[i] from the Seq pieces A and B. */
1681           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1682 
1683           ++ii;
1684         } else (*submat)[i] = A[i];
1685       }
1686     }
1687     ierr = PetscFree(A);CHKERRQ(ierr);
1688     ierr = PetscFree(B);CHKERRQ(ierr);
1689   }
1690   PetscFunctionReturn(0);
1691 } /* MatGetSubMatricesParallel_MPIXAIJ() */
1692 
1693 
1694 
1695 #undef __FUNCT__
1696 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1697 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1698 {
1699   PetscErrorCode ierr;
1700 
1701   PetscFunctionBegin;
1702   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
1703   PetscFunctionReturn(0);
1704 }
1705
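/*
   A minimal usage sketch for MatGetSubMatricesParallel_MPIAIJ() (illustrative only, not part of the
   original source).  It assumes C is an assembled MPIAIJ matrix with at least 2*size global rows and
   columns, and that this routine's prototype is visible to the caller.  Each rank requests two
   consecutive global rows/columns through index sets living on PETSC_COMM_WORLD, so the single
   resulting submatrix is itself parallel.
*/
#if 0
static PetscErrorCode ExampleGetParallelSubmatrix(Mat C)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  IS             isrow,iscol;
  Mat            *submat;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = ISCreateStride(PETSC_COMM_WORLD,2,2*rank,1,&isrow);CHKERRQ(ierr);
  ierr = ISCreateStride(PETSC_COMM_WORLD,2,2*rank,1,&iscol);CHKERRQ(ierr);
  ierr = MatGetSubMatricesParallel_MPIAIJ(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* submat[0] is an MPIAIJ matrix distributed over PETSC_COMM_WORLD */
  ierr = MatDestroy(&submat[0]);CHKERRQ(ierr);
  ierr = PetscFree(submat);CHKERRQ(ierr);
  ierr = ISDestroy(&isrow);CHKERRQ(ierr);
  ierr = ISDestroy(&iscol);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif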