xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 647798804180922dfb980ca29d2b49fa46af1416)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (no of index sets in the mesg)
37   -----------
38   mesg [1] = 1 => is[1]
39   mesg [2] = sizeof(is[1]);
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = sizeof(is[5]);
43   -----------
44   mesg [5]
45   mesg [n]  data(is[1])
46   -----------
47   mesg[n+1]
48   mesg[m]  data(is[5])
49   -----------
50 
51   Notes:
52   nrqs - no of requests sent (or to be sent out)
53   nrqr - no of requests received (which have to be or which have been processed)
54 */
55 #undef __FUNCT__
56 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - performs a single round of overlap growth
   for the imax index sets of rows in is[].  Rows owned locally are expanded
   in place via MatIncreaseOverlap_MPIAIJ_Local(); rows owned elsewhere are
   shipped to their owners, expanded there via
   MatIncreaseOverlap_MPIAIJ_Receive(), and the replies merged back.  The
   input index sets are destroyed and replaced by the enlarged ones (see the
   ISDestroy()/ISCreateGeneral() calls below).

   Collective: posts sends and receives on C's communicator, so every process
   must participate; presumably imax must be identical on all processes -
   TODO confirm against callers.
*/
57 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58 {
59   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
60   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61   const PetscInt **idx,*idx_i;
62   PetscInt       *n,*rtable,**data,len;
63   PetscErrorCode ierr;
64   PetscMPIInt    size,rank,tag1,tag2;
65   PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67   PetscBT        *table;
68   MPI_Comm       comm;
69   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70   MPI_Status     *s_status,*recv_status;
71 
72   PetscFunctionBegin;
73   comm   = ((PetscObject)C)->comm;
74   size   = c->size;
75   rank   = c->rank;
76   m      = C->rmap->N;
77 
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
79   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
80 
81   ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr); /* idx[i],n[i]: indices/length of is[i]; rtable[r]: owning rank of global row r */
82 
83   for (i=0; i<imax; i++) {
84     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
85     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
86   }
87 
88   /* Create hash table for the mapping :row -> proc*/
89   for (i=0,j=0; i<size; i++) {
90     len = C->rmap->range[i+1];
91     for (; j<len; j++) {
92       rtable[j] = i;
93     }
94   }
95 
96   /* evaluate communication - mesg to who,length of mesg, and buffer space
97      required. Based on this, buffers are allocated, and data copied into them*/
98   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
99   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
100   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
101   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
102   for (i=0; i<imax; i++) {
103     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
104     idx_i = idx[i];
105     len   = n[i];
106     for (j=0; j<len; j++) {
107       row  = idx_i[j];
108       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
109       proc = rtable[row];
110       w4[proc]++;
111     }
112     for (j=0; j<size; j++){
113       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
114     }
115   }
116 
117   nrqs     = 0;              /* no of outgoing messages */
118   msz      = 0;              /* total mesg length (for all procs) */
119   w1[rank] = 0;              /* no mesg sent to itself */
120   w3[rank] = 0;
121   for (i=0; i<size; i++) {
122     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
123   }
124   /* pa - is list of processors to communicate with */
125   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
126   for (i=0,j=0; i<size; i++) {
127     if (w1[i]) {pa[j] = i; j++;}
128   }
129 
130   /* Each message would have a header = 1 + 2*(no of IS) + data */
131   for (i=0; i<nrqs; i++) {
132     j      = pa[i];
133     w1[j] += w2[j] + 2*w3[j];
134     msz   += w1[j];
135   }
136 
137   /* Determine the number of messages to expect, their lengths, from from-ids */
138   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
139   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
140 
141   /* Now post the Irecvs corresponding to these messages */
142   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
143 
144   /* Allocate Memory for outgoing messages */
145   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
146   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
147   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
148 
  /* partition the single tmp buffer into per-destination slices: outdat[j]
     points at the slice (of length w1[j]) reserved for messages to proc j */
149   {
150     PetscInt *iptr = tmp,ict  = 0;
151     for (i=0; i<nrqs; i++) {
152       j         = pa[i];
153       iptr     +=  ict;
154       outdat[j] = iptr;
155       ict       = w1[j];
156     }
157   }
158 
159   /* Form the outgoing messages */
160   /*plug in the headers*/
161   for (i=0; i<nrqs; i++) {
162     j            = pa[i];
163     outdat[j][0] = 0;
164     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
165     ptr[j]       = outdat[j] + 2*w3[j] + 1;
166   }
167 
168   /* Memory for doing local proc's work*/
169   {
170     PetscInt  *d_p;
171     char      *t_p;
172 
173     /* should replace with PetscMallocN() */
174     ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
175       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
176     ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
177       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
178     data  = (PetscInt **)(table + imax);
179     isz   = (PetscInt  *)(data  + imax);
180     d_p   = (PetscInt  *)(isz   + imax);
181     t_p   = (char *)(d_p   + m*imax);
182     for (i=0; i<imax; i++) {
183       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
184       data[i]  = d_p + (m)*i;
185     }
186   }
187 
188   /* Parse the IS and update local tables and the outgoing buf with the data*/
189   {
190     PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
191     PetscBT table_i;
192 
193     for (i=0; i<imax; i++) {
194       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
195       n_i     = n[i];
196       table_i = table[i];
197       idx_i   = idx[i];
198       data_i  = data[i];
199       isz_i   = isz[i];
200       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
201         row  = idx_i[j];
202         proc = rtable[row];
203         if (proc != rank) { /* copy to the outgoing buffer */
204           ctr[proc]++;
205           *ptr[proc] = row;
206           ptr[proc]++;
207         } else { /* Update the local table */
208           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
209         }
210       }
211       /* Update the headers for the current IS */
212       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
213         if ((ctr_j = ctr[j])) {
214           outdat_j        = outdat[j];
215           k               = ++outdat_j[0];
216           outdat_j[2*k]   = ctr_j;
217           outdat_j[2*k-1] = i;
218         }
219       }
220       isz[i] = isz_i;
221     }
222   }
223 
224   /*  Now  post the sends */
225   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
226   for (i=0; i<nrqs; ++i) {
227     j    = pa[i];
228     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
229   }
230 
231   /* No longer need the original indices*/
232   for (i=0; i<imax; ++i) {
233     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
234   }
235   ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);
236 
237   for (i=0; i<imax; ++i) {
238     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
239   }
240 
241   /* Do Local work*/
242   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
243 
244   /* Receive messages*/
245   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
246   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
247 
248   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
249   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
250 
251   /* Phase 1 sends are complete - deallocate buffers */
252   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
253   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
254 
  /* xdata[i]/isz1[i]: reply message and its length for the i-th request */
255   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
256   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
257   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
258   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
259   ierr = PetscFree(rbuf);CHKERRQ(ierr);
260 
261 
262  /* Send the data back*/
263   /* Do a global reduction to know the buffer space req for incoming messages*/
264   {
265     PetscMPIInt *rw1;
266 
267     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
268     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
269 
270     for (i=0; i<nrqr; ++i) {
271       proc      = recv_status[i].MPI_SOURCE;
272       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
273       rw1[proc] = isz1[i];
274     }
275     ierr = PetscFree(onodes1);CHKERRQ(ierr);
276     ierr = PetscFree(olengths1);CHKERRQ(ierr);
277 
278     /* Determine the number of messages to expect, their lengths, from from-ids */
279     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
280     ierr = PetscFree(rw1);CHKERRQ(ierr);
281   }
282   /* Now post the Irecvs corresponding to these messages */
283   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
284 
285   /*  Now  post the sends */
286   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
287   for (i=0; i<nrqr; ++i) {
288     j    = recv_status[i].MPI_SOURCE;
289     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
290   }
291 
292   /* receive work done on other processors*/
293   {
294     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
295     PetscMPIInt idex;
296     PetscBT     table_i;
297     MPI_Status  *status2;
298 
299     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
300     for (i=0; i<nrqs; ++i) {
301       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
302       /* Process the message*/
303       rbuf2_i = rbuf2[idex];
304       ct1     = 2*rbuf2_i[0]+1;
305       jmax    = rbuf2[idex][0];
306       for (j=1; j<=jmax; j++) {
307         max     = rbuf2_i[2*j];
308         is_no   = rbuf2_i[2*j-1];
309         isz_i   = isz[is_no];
310         data_i  = data[is_no];
311         table_i = table[is_no];
312         for (k=0; k<max; k++,ct1++) {
313           row = rbuf2_i[ct1];
314           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
315         }
316         isz[is_no] = isz_i;
317       }
318     }
319 
320     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
321     ierr = PetscFree(status2);CHKERRQ(ierr);
322   }
323 
  /* rebuild each index set from its accumulated (local + remote) rows */
324   for (i=0; i<imax; ++i) {
325     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
326   }
327 
328   ierr = PetscFree(onodes2);CHKERRQ(ierr);
329   ierr = PetscFree(olengths2);CHKERRQ(ierr);
330 
331   ierr = PetscFree(pa);CHKERRQ(ierr);
332   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
333   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
334   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
335   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
337   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(table);CHKERRQ(ierr);
339   ierr = PetscFree(s_status);CHKERRQ(ierr);
340   ierr = PetscFree(recv_status);CHKERRQ(ierr);
341   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
342   ierr = PetscFree(xdata);CHKERRQ(ierr);
343   ierr = PetscFree(isz1);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
349 /*
350    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
351        the work on the local processor.
352 
353      Inputs:
354       C      - MAT_MPIAIJ;
355       imax - total no of index sets processed at a time;
356       table  - an array of char - size = m bits.
357 
358      Output:
359       isz    - array containing the count of the solution elements corresponding
360                to each index set;
361       data   - pointer to the solutions
362 */
/* Expands each data[i] row list in place by one layer of adjacency from the
   local diagonal (A) and off-diagonal (B) blocks, using table[i] to avoid
   duplicates; isz[i] is updated to the new length.  Assumes every entry of
   data[i] is a locally owned global row - TODO confirm against caller. */
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
369   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = C->rmap->rstart;
374   cstart = C->cmap->rstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) { /* data_i grows past max as we append; only the original isz[i] entries are expanded */
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart; /* shift diagonal-block column to global numbering */
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]]; /* garray: compressed off-diagonal column -> global column */
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
408 /*
409       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
410          and return the output
411 
412          Input:
413            C    - the matrix
414            nrqr - no of messages being processed.
415            rbuf - an array of pointers to the received requests
416 
417          Output:
418            xdata - array of messages to be sent back
419            isz1  - size of each message
420 
421   For better efficiency perhaps we should malloc separately each xdata[i],
422 then if a realloc is required we need only copy the data for that one row
423 rather than all previous rows as it is now where a single large chunk of
424 memory is used.
425 
426 */
427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
428 {
429   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
430   Mat            A = c->A,B = c->B;
431   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   PetscErrorCode ierr;
433   PetscMPIInt    rank;
434   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
435   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
436   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
437   PetscInt       *rbuf_i,kmax,rbuf_0;
438   PetscBT        xtable;
439 
440   PetscFunctionBegin;
441   rank   = c->rank;
442   m      = C->rmap->N;
443   rstart = C->rmap->rstart;
444   cstart = C->cmap->rstart;
445   ai     = a->i;
446   aj     = a->j;
447   bi     = b->i;
448   bj     = b->j;
449   garray = c->garray;
450 
451 
452   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
453     rbuf_i  =  rbuf[i];
454     rbuf_0  =  rbuf_i[0];
455     ct     += rbuf_0;
456     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
457   }
458 
459   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
460   else      max1 = 1;
461   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
462   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
463   ++no_malloc;
464   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
465   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
466 
467   ct3 = 0;
468   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
469     rbuf_i =  rbuf[i];
470     rbuf_0 =  rbuf_i[0];
471     ct1    =  2*rbuf_0+1;
472     ct2    =  ct1;
473     ct3    += ct1;
474     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
475       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
476       oct2 = ct2;
477       kmax = rbuf_i[2*j];
478       for (k=0; k<kmax; k++,ct1++) {
479         row = rbuf_i[ct1];
480         if (!PetscBTLookupSet(xtable,row)) {
481           if (!(ct3 < mem_estimate)) {
482             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
484             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
485             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
486             xdata[0]     = tmp;
487             mem_estimate = new_estimate; ++no_malloc;
488             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
489           }
490           xdata[i][ct2++] = row;
491           ct3++;
492         }
493       }
494       for (k=oct2,max2=ct2; k<max2; k++) {
495         row   = xdata[i][k] - rstart;
496         start = ai[row];
497         end   = ai[row+1];
498         for (l=start; l<end; l++) {
499           val = aj[l] + cstart;
500           if (!PetscBTLookupSet(xtable,val)) {
501             if (!(ct3 < mem_estimate)) {
502               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
503               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
504               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
505               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
506               xdata[0]     = tmp;
507               mem_estimate = new_estimate; ++no_malloc;
508               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
509             }
510             xdata[i][ct2++] = val;
511             ct3++;
512           }
513         }
514         start = bi[row];
515         end   = bi[row+1];
516         for (l=start; l<end; l++) {
517           val = garray[bj[l]];
518           if (!PetscBTLookupSet(xtable,val)) {
519             if (!(ct3 < mem_estimate)) {
520               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
521               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
522               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
523               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
524               xdata[0]     = tmp;
525               mem_estimate = new_estimate; ++no_malloc;
526               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
527             }
528             xdata[i][ct2++] = val;
529             ct3++;
530           }
531         }
532       }
533       /* Update the header*/
534       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
535       xdata[i][2*j-1] = rbuf_i[2*j-1];
536     }
537     xdata[i][0] = rbuf_0;
538     xdata[i+1]  = xdata[i] + ct2;
539     isz1[i]     = ct2; /* size of each message */
540   }
541   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
542   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 /* -------------------------------------------------------------------------*/
546 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
547 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
548 /*
549     Every processor gets the entire matrix
550 */
551 #undef __FUNCT__
552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - every process receives a sequential copy of
   the ENTIRE parallel matrix A.  With MAT_INITIAL_MATRIX the row lengths and
   column indices are allgathered and a new SeqAIJ matrix *Bin is built; with
   reuse, the existing **Bin is refilled.  Numerical values are gathered only
   when flag == MAT_GET_VALUES.  Collective on A's communicator.
*/
553 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
554 {
555   Mat            B;
556   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
557   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
558   PetscErrorCode ierr;
559   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
560   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
561   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
562   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
563 
564   PetscFunctionBegin;
565   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
566   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
567 
568   if (scall == MAT_INITIAL_MATRIX) {
569     /* ----------------------------------------------------------------
570          Tell every processor the number of nonzeros per row
571     */
572     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
573     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
574       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
575     }
576     sendcount = A->rmap->rend - A->rmap->rstart;
577     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
578     for (i=0; i<size; i++) {
579       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
580       displs[i]     = A->rmap->range[i];
581     }
582 #if defined(PETSC_HAVE_MPI_IN_PLACE)
583     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
584 #else
585     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
586 #endif
587     /* ---------------------------------------------------------------
588          Create the sequential matrix of the same type as the local block diagonal
589     */
590     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
591     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
592     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
593     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
594     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
595     **Bin = B;
596     b = (Mat_SeqAIJ *)B->data;
597 
598     /*--------------------------------------------------------------------
599        Copy my part of matrix column indices over
600     */
601     sendcount  = ad->nz + bd->nz;
602     jsendbuf   = b->j + b->i[rstarts[rank]]; /* this rank's slot inside B's global-sized column-index array */
603     a_jsendbuf = ad->j;
604     b_jsendbuf = bd->j;
605     n          = A->rmap->rend - A->rmap->rstart;
606     cnt        = 0;
607     for (i=0; i<n; i++) {
608 
609       /* put in lower diagonal portion */
610       m = bd->i[i+1] - bd->i[i];
611       while (m > 0) {
612         /* is it above diagonal (in bd (compressed) numbering) */
613         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
614         jsendbuf[cnt++] = garray[*b_jsendbuf++];
615         m--;
616       }
617 
618       /* put in diagonal portion */
619       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
620         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
621       }
622 
623       /* put in upper diagonal portion */
624       while (m-- > 0) {
625         jsendbuf[cnt++] = garray[*b_jsendbuf++];
626       }
627     }
628     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
629 
630     /*--------------------------------------------------------------------
631        Gather all column indices to all processors
632     */
633     for (i=0; i<size; i++) {
634       recvcounts[i] = 0;
635       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
636         recvcounts[i] += lens[j];
637       }
638     }
639     displs[0]  = 0;
640     for (i=1; i<size; i++) {
641       displs[i] = displs[i-1] + recvcounts[i-1];
642     }
643 #if defined(PETSC_HAVE_MPI_IN_PLACE)
644     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
645 #else
646     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
647 #endif
648     /*--------------------------------------------------------------------
649         Assemble the matrix into usable form (note numerical values not yet set)
650     */
651     /* set the b->ilen (length of each row) values */
652     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
653     /* set the b->i indices */
654     b->i[0] = 0;
655     for (i=1; i<=A->rmap->N; i++) {
656       b->i[i] = b->i[i-1] + lens[i-1];
657     }
658     ierr = PetscFree(lens);CHKERRQ(ierr);
659     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
660     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
661 
662   } else {
663     B  = **Bin;
664     b = (Mat_SeqAIJ *)B->data;
665   }
666 
667   /*--------------------------------------------------------------------
668        Copy my part of matrix numerical values into the values location
669   */
670   if (flag == MAT_GET_VALUES){
671     sendcount = ad->nz + bd->nz;
672     sendbuf   = b->a + b->i[rstarts[rank]];
673     a_sendbuf = ad->a;
674     b_sendbuf = bd->a;
675     b_sendj   = bd->j;
676     n         = A->rmap->rend - A->rmap->rstart;
677     cnt       = 0;
678     for (i=0; i<n; i++) {
679 
680       /* put in lower diagonal portion */
681       m = bd->i[i+1] - bd->i[i];
682       while (m > 0) {
683         /* is it above diagonal (in bd (compressed) numbering) */
684         if (garray[*b_sendj] > A->rmap->rstart + i) break;
685         sendbuf[cnt++] = *b_sendbuf++;
686         m--;
687         b_sendj++;
688       }
689 
690       /* put in diagonal portion */
691       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
692         sendbuf[cnt++] = *a_sendbuf++;
693       }
694 
695       /* put in upper diagonal portion */
696       while (m-- > 0) {
697         sendbuf[cnt++] = *b_sendbuf++;
698         b_sendj++;
699       }
700     }
701     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
702 
703     /* -----------------------------------------------------------------
704        Gather all numerical values to all processors
705     */
    /* recvcounts/displs survive from the MAT_INITIAL_MATRIX branch if it was
       taken; on the reuse path they are still 0 and allocated here */
706     if (!recvcounts) {
707       ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
708     }
709     for (i=0; i<size; i++) {
710       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
711     }
712     displs[0]  = 0;
713     for (i=1; i<size; i++) {
714       displs[i] = displs[i-1] + recvcounts[i-1];
715     }
716     recvbuf   = b->a;
    /* NOTE(review): in the non-MPI_IN_PLACE branch below, sendbuf points into
       b->a, which is also the receive buffer - verify against MPI's rule
       forbidding overlapping send/recv buffers without MPI_IN_PLACE */
717 #if defined(PETSC_HAVE_MPI_IN_PLACE)
718     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
719 #else
720     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
721 #endif
722   }  /* end of (flag == MAT_GET_VALUES) */
723   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
724 
725   if (A->symmetric){
726     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
727   } else if (A->hermitian) {
728     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
729   } else if (A->structurally_symmetric) {
730     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
731   }
732   PetscFunctionReturn(0);
733 }
734 
735 
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts ismax sequential submatrices of C,
   one per (isrow[i], iscol[i]) pair.  A fast path copies the whole matrix
   when every process asks for all of it (identity row/column index sets of
   full size); otherwise the work is done in stages of at most nmax
   submatrices by MatGetSubMatrices_MPIAIJ_Local().  Collective on C.
*/
739 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
740 {
741   PetscErrorCode ierr;
742   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
743   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
744 
745   PetscFunctionBegin;
746   /*
747        Check for special case each processor gets entire matrix
748   */
749   if (ismax == 1 && C->rmap->N == C->cmap->N) {
750     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
751     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
752     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
753     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
754     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
755       wantallmatrix = PETSC_TRUE;
756       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
757     }
758   }
  /* all ranks must agree before taking the fast path (MPI_MIN of the flags).
     NOTE(review): reduces a PetscBool with MPI_INT - assumes PetscBool is an
     int-sized enum; verify on all supported platforms */
759   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
760   if (twantallmatrix) {
761     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
762     PetscFunctionReturn(0);
763   }
764 
765   /* Allocate memory to hold all the submatrices */
766   if (scall != MAT_REUSE_MATRIX) {
767     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
768   }
769   /* Determine the number of stages through which submatrices are done */
770   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
771   /*
772      Each stage will extract nmax submatrices.
773      nmax is determined by the matrix column dimension.
774      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
775   */
776   if (!nmax) nmax = 1;
777   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
778 
779   /* Make sure every processor loops through the nstages */
780   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
781 
782   for (i=0,pos=0; i<nstages; i++) {
783     if (pos+nmax <= ismax) max_no = nmax;
784     else if (pos == ismax) max_no = 0;
785     else                   max_no = ismax-pos;
786     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
787     pos += max_no;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 
793 
794 
795 /* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - extracts ismax sequential submatrices
   C(isrow[i],iscol[i]) from the parallel AIJ matrix C into submats[].

   Rows owned by other processes are fetched with a four-round nonblocking
   exchange, one fresh MPI tag per round:
     tag0 - send each owner the list of rows this process needs (message
            layout is documented at the top of this file)
     tag1 - owners reply with the nonzero count of every requested row
     tag2 - owners send the global column indices of the requested rows
     tag3 - owners send the matching numerical values
   Locally owned rows are copied directly via MatGetRow_MPIAIJ.

   With scall == MAT_REUSE_MATRIX the existing submats[] are refilled (their
   sizes and row lengths must match); otherwise new SeqAIJ matrices are created.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted */
  /* NOTE(review): the ISSorted() results are currently discarded -- the
     corresponding SETERRQ checks are commented out below */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total #rows requested of proc p; w2[p]: 1 iff a message goes to p;
     w3[p]: #index sets that touch rows owned by p; w4: per-IS scratch counts */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find the owner of this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve the per-destination send buffers sbuf1[j] out of the single array tmp */
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 (the row-length replies) reuses the tail of tmp past the send data */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  /* Answer each incoming request: for every requested (locally owned) row,
     report its total nonzero count (diag + off-diag parts) back to the requester */
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  /* As each count reply arrives, allocate matching index/value buffers and
     post the recvs for rounds 3 (column indices, tag2) and 4 (values, tag3) */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          /* global column order: off-diag cols below cstart, then the diagonal
             block, then off-diag cols at/after cend */
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          /* must mirror the ordering used for the column indices above */
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  /* cmap[i][gcol] = 1-based position of global column gcol in iscol[i], 0 if absent */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of row j of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map*/
  /* rmap[i][grow] = local (submatrix) row index of global row grow in isrow[i] */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* Uses the sent request headers (sbuf1) to decode which IS/row each batch of
     received column indices belongs to */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1397 
1398 /*
1399  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1400  Be careful not to destroy them elsewhere.
1401  */
1402 #undef __FUNCT__
1403 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1404 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1405 {
1406   /* If making this function public, change the error returned in this function away from _PLIB. */
1407   PetscErrorCode ierr;
1408   Mat_MPIAIJ *aij;
1409   PetscBool seqaij;
1410 
1411   PetscFunctionBegin;
1412   /* Check to make sure the component matrices are compatible with C. */
1413   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1414   if(!seqaij) {
1415     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1416   }
1417   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1418   if(!seqaij) {
1419     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1420   }
1421   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1422     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1423   }
1424   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1425   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1426   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1427   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1428   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1429   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1430   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1431     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1432   }
1433   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1434   aij = (Mat_MPIAIJ*)((*C)->data);
1435   aij->A = A;
1436   aij->B = B;
1437   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1438   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1439   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1440   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 #undef __FUNCT__
1445 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1446 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1447 {
1448   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1449   PetscFunctionBegin;
1450   PetscValidPointer(A,2);
1451   PetscValidPointer(B,3);
1452   *A = aij->A;
1453   *B = aij->B;
1454   /* Note that we don't incref *A and *B, so be careful! */
1455   PetscFunctionReturn(0);
1456 }
1457 
1458 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   MatGetSubMatricesParallel_MPIXAIJ - type-independent driver that extracts
   submatrices C(isrow[i],iscol[i]) where the index sets may live on parallel
   (communicator size > 1) subcommunicators.  The format-specific work is
   delegated to three callbacks:
     getsubmats_seq - extracts sequential submatrices (used for both the diag
                      and off-diag pieces)
     makefromseq    - assembles a parallel matrix from a diag/off-diag pair
     extractseq     - recovers the diag/off-diag pair from an existing
                      parallel matrix (for MAT_REUSE_MATRIX)
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt size, flag;
  PetscInt    i,ii;
  PetscInt    ismax_c;
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate that each row/col IS pair shares a communicator, and count the
     index sets living on parallel (size > 1) subcommunicators. */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else {
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    /* NOTE(review): this declaration shadows the outer 'size' -- harmless but confusing */
    PetscMPIInt size;
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
    */

    /*
     Sequential matrix arrays are allocated in any event.
     Even if the array of parallel matrices already exists, we have to consolidate the underlying seq matrices.
    */
    ierr = PetscMalloc2(ismax, Mat, &A, ismax_c, Mat, &B); CHKERRQ(ierr);

    /* Parallel matrix array is only allocated if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
	isrow_c[ii] = isrow[i];
	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
	++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    /* NOTE(review): when scall != MAT_REUSE_MATRIX, getsubmats_seq (e.g.
       MatGetSubMatrices_MPIAIJ) allocates a fresh result array into *(&A)
       and *(&B), replacing the PetscMalloc2'd arrays above -- confirm that
       PetscFree2(A,B) below then releases the intended storage and that the
       original PetscMalloc2 allocation is not leaked. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    ierr = PetscFree2(A,B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */
1572 
1573 
1574 
1575 #undef __FUNCT__
1576 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1577 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1578 {
1579   PetscErrorCode ierr;
1580   PetscFunctionBegin;
1581   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1582                                            MatCreateMPIAIJFromSeqMatrices_Private,
1583                                            MatMPIAIJExtractSeqMatrices_Private);
1584   CHKERRQ(ierr);
1585   PetscFunctionReturn(0);
1586 }
1587