xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 7d0a6c19129e7069c8a40e210b34ed62989173db)
1 
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
6 #include "../src/mat/impls/aij/mpi/mpiaij.h"
7 #include "petscbt.h"
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Expands each of the imax index sets in
   is[] by one level of overlap.  Rows owned locally are expanded in place;
   rows owned elsewhere are shipped to their owners (phase 1), expanded
   there by MatIncreaseOverlap_MPIAIJ_Receive, and the results are shipped
   back (phase 2).  The input index sets are destroyed and replaced by the
   enlarged ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  m      = C->rmap->N;                 /* global number of rows */

  /* Fresh tags keep this routine's two message phases separate from other traffic */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create lookup table for the mapping: global row -> owning process */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;                      /* w4[p] = rows of this IS owned by process p */
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}  /* w1 = total data per dest, w3 = number of ISs touching dest */
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve the single buffer tmp into one contiguous message slot per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] marks where the data (after the header) begins */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one allocation carved into the
     table/data/isz arrays plus their backing storage */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should replace with PetscMallocN() */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char *)(d_p   + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;   /* one m-bit table per IS */
      data[i]  = d_p + (m)*i;                         /* one m-entry row list per IS */
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; PetscBTLookupSet returns the old bit, so each row enters data_i once */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];   /* bump the IS count in the header */
          outdat_j[2*k]   = ctr_j;           /* number of rows for this IS */
          outdat_j[2*k-1] = i;               /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* The input index sets are replaced by the expanded ones at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Expand the rows requested by other processes; xdata[i]/isz1[i] hold the replies */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      /* recv_status order must match onodes1 because PetscPostIrecvInt posted one recv per onodes1 entry */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the outgoing requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;     /* data starts after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Replace the (destroyed) input index sets with the expanded ones */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
344 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;   /* first global row owned by this process */
  cstart = C->cmap->rstart;   /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;         /* maps compressed off-diagonal column -> global column */


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* For each row already in index set i, append the global column indices of
       that row's nonzeros; the bit table suppresses duplicates so data_i stays
       a set.  Only the first isz[i] entries (the pre-existing rows) are scanned;
       newly appended entries are handled on the next overlap level. */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;     /* convert to local row number */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;         /* diagonal block: shift to global numbering */
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];          /* off-diagonal block: translate via garray */
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
403 
404 #undef __FUNCT__
405 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
406 /*
407       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
408          and return the output
409 
410          Input:
411            C    - the matrix
412            nrqr - no of messages being processed.
413            rbuf - an array of pointers to the recieved requests
414 
415          Output:
416            xdata - array of messages to be sent back
417            isz1  - size of each message
418 
419   For better efficiency perhaps we should malloc separately each xdata[i],
420 then if a remalloc is required we need only copy the data for that one row
421 rather then all previous rows as it is now where a single large chunck of
422 memory is used.
423 
424 */
425 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
426 {
427   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
428   Mat            A = c->A,B = c->B;
429   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
430   PetscErrorCode ierr;
431   PetscMPIInt    rank;
432   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
433   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
434   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
435   PetscInt       *rbuf_i,kmax,rbuf_0;
436   PetscBT        xtable;
437 
438   PetscFunctionBegin;
439   rank   = c->rank;
440   m      = C->rmap->N;
441   rstart = C->rmap->rstart;
442   cstart = C->cmap->rstart;
443   ai     = a->i;
444   aj     = a->j;
445   bi     = b->i;
446   bj     = b->j;
447   garray = c->garray;
448 
449 
450   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
451     rbuf_i  =  rbuf[i];
452     rbuf_0  =  rbuf_i[0];
453     ct     += rbuf_0;
454     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
455   }
456 
457   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
458   else      max1 = 1;
459   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
460   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
461   ++no_malloc;
462   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
463   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
464 
465   ct3 = 0;
466   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
467     rbuf_i =  rbuf[i];
468     rbuf_0 =  rbuf_i[0];
469     ct1    =  2*rbuf_0+1;
470     ct2    =  ct1;
471     ct3    += ct1;
472     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
473       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
474       oct2 = ct2;
475       kmax = rbuf_i[2*j];
476       for (k=0; k<kmax; k++,ct1++) {
477         row = rbuf_i[ct1];
478         if (!PetscBTLookupSet(xtable,row)) {
479           if (!(ct3 < mem_estimate)) {
480             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
481             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
482             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
483             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
484             xdata[0]     = tmp;
485             mem_estimate = new_estimate; ++no_malloc;
486             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
487           }
488           xdata[i][ct2++] = row;
489           ct3++;
490         }
491       }
492       for (k=oct2,max2=ct2; k<max2; k++) {
493         row   = xdata[i][k] - rstart;
494         start = ai[row];
495         end   = ai[row+1];
496         for (l=start; l<end; l++) {
497           val = aj[l] + cstart;
498           if (!PetscBTLookupSet(xtable,val)) {
499             if (!(ct3 < mem_estimate)) {
500               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
501               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
502               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
503               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
504               xdata[0]     = tmp;
505               mem_estimate = new_estimate; ++no_malloc;
506               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
507             }
508             xdata[i][ct2++] = val;
509             ct3++;
510           }
511         }
512         start = bi[row];
513         end   = bi[row+1];
514         for (l=start; l<end; l++) {
515           val = garray[bj[l]];
516           if (!PetscBTLookupSet(xtable,val)) {
517             if (!(ct3 < mem_estimate)) {
518               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
519               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
520               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
521               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
522               xdata[0]     = tmp;
523               mem_estimate = new_estimate; ++no_malloc;
524               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
525             }
526             xdata[i][ct2++] = val;
527             ct3++;
528           }
529         }
530       }
531       /* Update the header*/
532       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
533       xdata[i][2*j-1] = rbuf_i[2*j-1];
534     }
535     xdata[i][0] = rbuf_0;
536     xdata[i+1]  = xdata[i] + ct2;
537     isz1[i]     = ct2; /* size of each message */
538   }
539   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
540   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
541   PetscFunctionReturn(0);
542 }
543 /* -------------------------------------------------------------------------*/
544 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
545 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    MatGetSubMatrix_MPIAIJ_All - Every processor gets a sequential copy of
    the entire parallel matrix A, returned in *Bin.  With MAT_INITIAL_MATRIX
    the structure is built and gathered; with MAT_REUSE_MATRIX the existing
    *Bin is reused.  Numerical values are gathered only when
    flag == MAT_GET_VALUES.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local row length = nonzeros in diagonal block + nonzeros in off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, merging the diagonal and
       off-diagonal blocks back into global, column-sorted order:
       below-diagonal B entries, then A entries, then above-diagonal B entries
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];   /* write directly into B's j array (gathered in place below) */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       in the same merged order as the column indices above
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];    /* write directly into B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the counts were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* Propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
732 
733 
734 
735 #undef __FUNCT__
736 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
737 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
738 {
739   PetscErrorCode ierr;
740   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
741   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
742 
743   PetscFunctionBegin;
744   /*
745        Check for special case each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
755     }
756   }
757   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
758   if (twantallmatrix) {
759     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
760     PetscFunctionReturn(0);
761   }
762 
763   /* Allocate memory to hold all the submatrices */
764   if (scall != MAT_REUSE_MATRIX) {
765     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
766   }
767   /* Determine the number of stages through which submatrices are done */
768   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
769   /*
770      Each stage will extract nmax submatrices.
771      nmax is determined by the matrix column dimension.
772      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
773   */
774   if (!nmax) nmax = 1;
775   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
776 
777   /* Make sure every processor loops through the nstages */
778   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
779 
780   for (i=0,pos=0; i<nstages; i++) {
781     if (pos+nmax <= ismax) max_no = nmax;
782     else if (pos == ismax) max_no = 0;
783     else                   max_no = ismax-pos;
784     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
785     pos += max_no;
786   }
787   PetscFunctionReturn(0);
788 }
789 
790 
791 
792 
793 /* -------------------------------------------------------------------------*/
794 #undef __FUNCT__
795 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts 'ismax' sequential submatrices of the
   parallel AIJ matrix C, one per (isrow[i], iscol[i]) pair, into submats[].

   Every process ends up with complete SeqAIJ submatrices for its own index sets;
   rows owned by other processes are fetched with a four-phase nonblocking protocol:
     1. tag0: send each owner the list of rows we need (with a per-IS header),
     2. tag1: owners reply with the nonzero count of each requested row,
     3. tag2: owners send the (global) column indices of those rows,
     4. tag3: owners send the corresponding numerical values.
   Locally-owned rows are copied directly via MatGetRow_MPIAIJ.

   Collective on C. With scall == MAT_REUSE_MATRIX the existing submats[] are
   refilled and must have identical sizes and nonzero patterns.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted */
  /* NOTE(review): the SETERRQ calls on unsorted input are commented out, so this
     loop currently has no observable effect beyond calling ISSorted(). */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache raw index arrays and sizes of every row/column IS */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total rows requested from proc p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of index sets that touch p, w4 = per-IS scratch counts */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find which proc owns this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp holds 2*msz ints: the first msz are the send buffers (sbuf1 points into
     it), the second msz serve later as the tag1 receive buffers (rbuf2) */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve tmp into one contiguous send buffer per destination proc */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* Header layout (see file comment): [0]=#IS, then (is_index, count) pairs;
     ptr[j] tracks the current write position in the data section */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 reuses the second half of tmp; each reply echoes the request header
     with per-row nonzero counts, and rbuf2[i][0] = total nonzeros we will get */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* Service incoming row requests: for each requested row count its nonzeros
     (diag part sA plus off-diag part sB) and reply on tag1 */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply arrives, allocate exactly-sized buffers and post the
     receives for the column indices (tag2) and values (tag3) */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* One contiguous allocation sliced per requester (req_size[i] entries each) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;   /* ct1 indexes rows in the request; ct2 offsets into the column buffer */
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global columns in ascending order: off-diag < cstart, then diag, then off-diag >= cend */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Values are packed in exactly the same order as the column indices above */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  /* cmap[i][gcol] = (position of gcol in iscol[i]) + 1, or 0 if absent.
     Dense over the global column dimension, so lookups are O(1). */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][r] will hold the nonzero count of row r in submatrix i (for preallocation) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map*/
  /* rmap[i][grow] = local row index of global row grow in submatrix i */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* Walk each tag2 reply against the matching request header (sbuf1) to count
     which received columns survive the cmap filter */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          /* keep only columns present in iscol[i]; tcol-1 undoes the +1 stored in cmap */
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* Same filtering, driven by the tag3 (values) replies paired with the tag2
     column data received earlier */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1396 
1397 /*
1398  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1399  Be careful not to destroy them elsewhere.
1400  */
1401 #undef __FUNCT__
1402 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1403 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1404 {
1405   /* If making this function public, change the error returned in this function away from _PLIB. */
1406   PetscErrorCode ierr;
1407   Mat_MPIAIJ *aij;
1408   PetscBool seqaij;
1409 
1410   PetscFunctionBegin;
1411   /* Check to make sure the component matrices are compatible with C. */
1412   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1413   if(!seqaij) {
1414     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1415   }
1416   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1417   if(!seqaij) {
1418     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1419   }
1420   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1421     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1422   }
1423   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1424   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1425   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1426   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1427   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1428   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1429   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1430     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1431   }
1432   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1433   aij = (Mat_MPIAIJ*)((*C)->data);
1434   aij->A = A;
1435   aij->B = B;
1436   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1437   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1438   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1439   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 #undef __FUNCT__
1444 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1445 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1446 {
1447   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1448   PetscFunctionBegin;
1449   PetscValidPointer(A,2);
1450   PetscValidPointer(B,3);
1451   *A = aij->A;
1452   *B = aij->B;
1453   /* Note that we don't incref *A and *B, so be careful! */
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 
1458 #undef __FUNCT__
1459 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1460 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1461 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1462 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1463 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1464 {
1465   PetscErrorCode ierr;
1466   PetscMPIInt size, flag;
1467   PetscInt    i,ii;
1468   PetscInt    ismax_c;
1469   PetscFunctionBegin;
1470   if(!ismax) {
1471     PetscFunctionReturn(0);
1472   }
1473   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1474     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1475     if(flag != MPI_IDENT) {
1476       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1477     }
1478     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1479     if(size > 1) {
1480       ++ismax_c;
1481     }
1482   }
1483   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1484     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1485   }
1486   else { /* if(ismax_c) */
1487     Mat *A,*B;
1488     IS  *isrow_c, *iscol_c;
1489     PetscMPIInt size;
1490     /*
1491      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1492      array of sequential matrices underlying the resulting parallel matrices.
1493      Which arrays to allocate is based on the value of MatReuse scall.
1494      There are as many diag matrices as there are original index sets.
1495      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1496 
1497      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1498      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1499      will deposite the extracted diag and off-diag parts.
1500      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1501      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1502     */
1503 
1504     /* Parallel matrix array is allocated only if no reuse is taking place. */
1505     if (scall != MAT_REUSE_MATRIX) {
1506       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1507     }
1508     else {
1509       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1510       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1511       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1512       for(i = 0, ii = 0; i < ismax; ++i) {
1513         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1514         if(size > 1) {
1515           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1516           ++ii;
1517         }
1518         else {
1519           A[i] = (*submat)[i];
1520         }
1521       }
1522     }
1523     /*
1524      Construct the complements of the iscol ISs for parallel ISs only.
1525      These are used to extract the off-diag portion of the resulting parallel matrix.
1526      The row IS for the off-diag portion is the same as for the diag portion,
1527      so we merely alias the row IS, while skipping those that are sequential.
1528     */
1529     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1530     for(i = 0, ii = 0; i < ismax; ++i) {
1531       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1532       if(size > 1) {
1533 	isrow_c[ii] = isrow[i];
1534 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1535 	++ii;
1536       }
1537     }
1538     /* Now obtain the sequential A and B submatrices separately. */
1539     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1540     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1541     for(ii = 0; ii < ismax_c; ++ii) {
1542       ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
1543     }
1544     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1545     /*
1546      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1547      have been extracted directly into the parallel matrices containing them, or
1548      simply into the sequential matrix identical with the corresponding A (if size == 1).
1549      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1550      A is put into the correct submat slot (if size == 1).
1551      */
1552     if(scall != MAT_REUSE_MATRIX) {
1553       for(i = 0, ii = 0; i < ismax; ++i) {
1554         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1555         if(size > 1) {
1556           /*
1557            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1558            */
1559           /* Construct submat[i] from the Seq pieces A and B. */
1560           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1561 
1562           ++ii;
1563         }
1564         else {
1565           (*submat)[i] = A[i];
1566         }
1567       }
1568     }
1569     ierr = PetscFree(A); CHKERRQ(ierr);
1570     ierr = PetscFree(B); CHKERRQ(ierr);
1571   }
1572   PetscFunctionReturn(0);
1573 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1574 
1575 
1576 
1577 #undef __FUNCT__
1578 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1579 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1580 {
1581   PetscErrorCode ierr;
1582   PetscFunctionBegin;
1583   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1584                                            MatCreateMPIAIJFromSeqMatrices_Private,
1585                                            MatMPIAIJExtractSeqMatrices_Private);
1586   CHKERRQ(ierr);
1587   PetscFunctionReturn(0);
1588 }
1589