xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   One round of overlap increase.  Phase 1: every process ships the rows of
   each IS that it does not own to their owning processes (message format
   above).  Locally-owned rows are expanded by matrix adjacency in
   MatIncreaseOverlap_MPIAIJ_Local; received rows are expanded in
   MatIncreaseOverlap_MPIAIJ_Receive.  Phase 2: the expanded lists travel
   back to the requesters and are merged (duplicate-free, via PetscBT
   bit tables) into brand-new ISs that replace the inputs in is[].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two fresh tags: tag1 for the request phase, tag2 for the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  /* idx[i]/n[i] = raw indices and local size of the i-th input IS */
  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total no of indices destined for proc p
     w2[p] = 1 iff any message goes to proc p
     w3[p] = no of ISs that contribute to the message for proc p
     w4    = per-IS scratch counts */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp is one contiguous slab that
     outdat[] points into, ptr[] tracks the current write position per proc */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the slab: each destination proc gets a window of w1[j] ints */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table and one index list
     per IS, both carved out of single slabs (t_p, d_p) */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; LookupSet returns prior bit, so
                    each row enters data_i at most once */
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* inputs are replaced by fresh ISs at the end, so destroy them now */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with the communication above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received requests; xdata[i]/isz1[i] become the reply for the
     i-th received message */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;

      /* replies mirror the requests, so sources must match onodes1 */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* process replies in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header gives (is_no,count) pairs followed by data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged ISs in place of the (destroyed) inputs */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
334 
335 #undef __FUNCT__
336 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
337 /*
338    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
339        the work on the local processor.
340 
341      Inputs:
342       C      - MAT_MPIAIJ;
343       imax - total no of index sets processed at a time;
344       table  - an array of char - size = m bits.
345 
346      Output:
347       isz    - array containing the count of the solution elements corresponding
348                to each index set;
349       data   - pointer to the solutions
350 */
351 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
352 {
353   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
354   Mat        A  = c->A,B = c->B;
355   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
356   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
357   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
358   PetscBT    table_i;
359 
360   PetscFunctionBegin;
361   rstart = C->rmap->rstart;
362   cstart = C->cmap->rstart;
363   ai     = a->i;
364   aj     = a->j;
365   bi     = b->i;
366   bj     = b->j;
367   garray = c->garray;
368 
369 
370   for (i=0; i<imax; i++) {
371     data_i  = data[i];
372     table_i = table[i];
373     isz_i   = isz[i];
374     for (j=0,max=isz[i]; j<max; j++) {
375       row   = data_i[j] - rstart;
376       start = ai[row];
377       end   = ai[row+1];
378       for (k=start; k<end; k++) { /* Amat */
379         val = aj[k] + cstart;
380         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
381       }
382       start = bi[row];
383       end   = bi[row+1];
384       for (k=start; k<end; k++) { /* Bmat */
385         val = garray[bj[k]];
386         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
387       }
388     }
389     isz[i] = isz_i;
390   }
391   PetscFunctionReturn(0);
392 }
393 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a re-malloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct = total no of (is_no,count) header pairs; total_sz = total no of
     requested indices across all incoming messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic size for the single reply slab (grown on demand below);
     max1 approximates the average adjacency fan-out of the requested rows */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 = read cursor in rbuf_i, ct2 = write cursor in xdata[i],
     ct3 = total ints written to the slab (drives the realloc check) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh bit table per IS: duplicates are removed per-IS, not globally */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* slab full: grow by 1.5x, then re-derive every xdata[] pointer
               since they all alias the (moved) slab */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row by its local adjacency (diagonal block A,
         then off-diagonal block B mapped to global columns via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: reply reuses the request's header layout */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
531 /* -------------------------------------------------------------------------*/
532 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
533 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   Gathers the whole parallel matrix onto every process as one sequential
   matrix.  With MAT_INITIAL_MATRIX the structure (row lengths + column
   indices) is built via two MPI_Allgatherv calls; with MAT_REUSE_MATRIX the
   existing structure in *Bin is kept.  When flag == MAT_GET_VALUES a third
   Allgatherv fills in the numerical values, otherwise only the pattern is
   produced.  Each process contributes its local rows with columns emitted
   in global order: below-diagonal B part, then A (diagonal) part, then
   above-diagonal B part.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* each rank wrote its own slice of lens, so in-place gather works */
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into B's
       CSR j array at the offset of this rank's first row
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion (m = remaining B entries for this row) */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, only refresh values below */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the index pass above, so
       values line up with the already-gathered column indices */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: counts/displs were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
731 
732   PetscFunctionBegin;
733   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
734   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
735      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
736      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
737    */
738   for (i = 0; i < ismax; ++i) {
739     PetscBool sorted;
740     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
741     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
742   }
743 
744   /*
745        Check for special case: each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754 
755       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
756     }
757   }
758   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
759   if (twantallmatrix) {
760     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
761     PetscFunctionReturn(0);
762   }
763 
764   /* Allocate memory to hold all the submatrices */
765   if (scall != MAT_REUSE_MATRIX) {
766     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
767   }
768 
769   /* Check for special case: each processor gets entire matrix columns */
770   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
771   for (i=0; i<ismax; i++) {
772     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
773     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
774     if (colflag && ncol == C->cmap->N) {
775       allcolumns[i] = PETSC_TRUE;
776     } else {
777       allcolumns[i] = PETSC_FALSE;
778     }
779   }
780 
781   /* Determine the number of stages through which submatrices are done */
782   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
783 
784   /*
785      Each stage will extract nmax submatrices.
786      nmax is determined by the matrix column dimension.
787      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
788   */
789   if (!nmax) nmax = 1;
790   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
791 
792   /* Make sure every processor loops through the nstages */
793   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
794 
795   for (i=0,pos=0; i<nstages; i++) {
796     if (pos+nmax <= ismax) max_no = nmax;
797     else if (pos == ismax) max_no = 0;
798     else                   max_no = ismax-pos;
799     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
800     pos += max_no;
801   }
802 
803   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
804   PetscFunctionReturn(0);
805 }
806 
807 /* -------------------------------------------------------------------------*/
808 #undef __FUNCT__
809 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Performs one "stage" of submatrix extraction for
   an MPIAIJ matrix: for each pair (isrow[i],iscol[i]) it builds the sequential
   submatrix submats[i] on PETSC_COMM_SELF.

   Rows of C owned by other processes are fetched through a four-round nonblocking
   exchange, using four distinct tags so the rounds cannot be confused:
     tag0: requested row numbers                      (requester -> owner)
     tag1: per-row column counts (reply buffer sizes) (owner -> requester)
     tag2: column indices of the requested rows       (owner -> requester)
     tag3: numerical values of the requested rows     (owner -> requester)

   Message format for the tag0 requests (see also the comment above
   MatIncreaseOverlap_MPIAIJ_Once at the top of this file):
     sbuf1[p][0]                 = number of index sets that need rows from process p
     sbuf1[p][2*k-1],sbuf1[p][2*k] = (index-set number, number of rows) for the k-th set
     ...followed by the requested global row numbers themselves.

   allcolumns[i] == PETSC_TRUE means iscol[i] spans all columns of C; then the
   global->local column map (cmap) is skipped and global column indices are stored
   directly in submats[i].

   With PETSC_USE_CTABLE the global->local row/column maps are hash tables (values
   stored shifted by +1 so that 0 means "absent"); otherwise they are dense integer
   arrays of length C->cmap->N / C->rmap->N per index set.

   If scall == MAT_REUSE_MATRIX the existing submats[] are reused and must have
   exactly the same dimensions and nonzero pattern as a fresh extraction would give.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache indices/sizes of all index sets; for the allcolumns fast path no
     column indices are needed, only the global column count. */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* Work vectors, indexed by destination process p:
       w1[p] = total number of rows requested from p (later grown to include header)
       w2[p] = 1 if any message goes to p, else 0
       w3[p] = number of index sets that need rows from p
       w4[p] = per-index-set scratch row count                                       */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the owner of this row */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp holds 2*msz entries: the first msz are sliced into the per-destination
     send buffers sbuf1[], the second msz are reused below as rbuf2[] */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Slice the single tmp allocation into one contiguous send buffer per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* The size replies reuse the second half of the tmp allocation (see above) */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For every incoming request compute, per requested row, the number of nonzeros
     (diagonal part A plus off-diagonal part B) and reply with those counts (tag1). */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      /* idex is the index of whichever posted receive completed, not i */
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply (tag1) arrives, allocate matching column-index (rbuf3)
     and value (rbuf4) buffers and post the tag2/tag3 receives into them. */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* Emit columns in ascending global order: off-diag columns below the
             diagonal block, then the diagonal block, then off-diag columns above */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Values are packed in exactly the same lower/diag/upper order as the column
     indices above so the two streams stay aligned */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend   = C->cmap->rend;
    PetscInt    *b_j   = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* Stored values are local column + 1 so that 0 doubles as "not in iscol[i]" */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of local row j of submats[i];
     all counts live in one contiguous allocation hanging off lens[0] */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* Walk each received column-index message (tag2); the matching send buffer
     sbuf1[] tells which index set and which rows each chunk belongs to. */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* Values are written straight into the SeqAIJ internal arrays (mat->i/j/a, ilen)
     rather than through MatSetValues */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* freeing tmp here also releases the storage aliased by sbuf1[] and rbuf2[] */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1533 
/*
 Observe that the Seq matrices used to construct this MPI matrix are not increfed.
 Be careful not to destroy them elsewhere.
 */
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
  Builds an MPIAIJ matrix *C on communicator comm directly from two existing
  sequential matrices: A becomes the diagonal block and B the off-diagonal
  block (see the SETERRQ messages below).  Both must be MATSEQAIJ with equal
  local row counts and block sizes.  The pieces are adopted without an extra
  reference (see the note above), and *C inherits their preallocated/assembled
  state rather than being assembled here.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Local sizes come from the diagonal block; global sizes are determined by the layout setup. */
  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* NOTE(review): this check assumes B's local column count equals the number of
     non-diagonal global columns (i.e. B is stored uncompressed) -- confirm against
     the convention used by the callers that construct B. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  /* Plug the sequential pieces directly into the MPIAIJ data structure (no incref!). */
  ierr    = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  aij    = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr   = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
  ierr   = PetscLogObjectParent(*C,B);CHKERRQ(ierr);

  /* C counts as preallocated/assembled only to the extent both of its pieces are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}
1572 
1573 #undef __FUNCT__
1574 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1575 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1576 {
1577   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1578 
1579   PetscFunctionBegin;
1580   PetscValidPointer(A,2);
1581   PetscValidPointer(B,3);
1582   *A = aij->A;
1583   *B = aij->B;
1584   /* Note that we don't incref *A and *B, so be careful! */
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 #undef __FUNCT__
1589 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1590 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1591                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1592                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1593                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1594 {
1595   PetscErrorCode ierr;
1596   PetscMPIInt    size, flag;
1597   PetscInt       i,ii;
1598   PetscInt       ismax_c;
1599 
1600   PetscFunctionBegin;
1601   if (!ismax) PetscFunctionReturn(0);
1602 
1603   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1604     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1605     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1606     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1607     if (size > 1) ++ismax_c;
1608   }
1609   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1610     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1611   } else { /* if (ismax_c) */
1612     Mat         *A,*B;
1613     IS          *isrow_c, *iscol_c;
1614     PetscMPIInt size;
1615     /*
1616      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1617      array of sequential matrices underlying the resulting parallel matrices.
1618      Which arrays to allocate is based on the value of MatReuse scall.
1619      There are as many diag matrices as there are original index sets.
1620      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1621 
1622      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1623      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1624      will deposite the extracted diag and off-diag parts.
1625      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1626      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1627     */
1628 
1629     /* Parallel matrix array is allocated only if no reuse is taking place. */
1630     if (scall != MAT_REUSE_MATRIX) {
1631       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1632     } else {
1633       ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
1634       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
1635       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1636       for (i = 0, ii = 0; i < ismax; ++i) {
1637         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1638         if (size > 1) {
1639           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1640           ++ii;
1641         } else A[i] = (*submat)[i];
1642       }
1643     }
1644     /*
1645      Construct the complements of the iscol ISs for parallel ISs only.
1646      These are used to extract the off-diag portion of the resulting parallel matrix.
1647      The row IS for the off-diag portion is the same as for the diag portion,
1648      so we merely alias the row IS, while skipping those that are sequential.
1649     */
1650     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
1651     for (i = 0, ii = 0; i < ismax; ++i) {
1652       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1653       if (size > 1) {
1654         isrow_c[ii] = isrow[i];
1655 
1656         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1657         ++ii;
1658       }
1659     }
1660     /* Now obtain the sequential A and B submatrices separately. */
1661     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1662     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1663     for (ii = 0; ii < ismax_c; ++ii) {
1664       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1665     }
1666     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1667     /*
1668      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1669      have been extracted directly into the parallel matrices containing them, or
1670      simply into the sequential matrix identical with the corresponding A (if size == 1).
1671      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1672      A is put into the correct submat slot (if size == 1).
1673      */
1674     if (scall != MAT_REUSE_MATRIX) {
1675       for (i = 0, ii = 0; i < ismax; ++i) {
1676         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1677         if (size > 1) {
1678           /*
1679            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1680            */
1681           /* Construct submat[i] from the Seq pieces A and B. */
1682           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1683 
1684           ++ii;
1685         } else (*submat)[i] = A[i];
1686       }
1687     }
1688     ierr = PetscFree(A);CHKERRQ(ierr);
1689     ierr = PetscFree(B);CHKERRQ(ierr);
1690   }
1691   PetscFunctionReturn(0);
1692 } /* MatGetSubMatricesParallel_MPIXAIJ() */
1693 
1694 
1695 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
  MPIAIJ instantiation of the generic parallel-submatrix driver
  MatGetSubMatricesParallel_MPIXAIJ: supplies the AIJ-specific callbacks for
  sequential extraction (MatGetSubMatrices_MPIAIJ), construction of a parallel
  matrix from its seq pieces (MatCreateMPIAIJFromSeqMatrices_Private), and
  retrieval of those pieces (MatMPIAIJExtractSeqMatrices_Private).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1706