xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 047b9c12be44a5e56fb8d857a97b174db49aaedf)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51   nrqr - no of requests received (which have to be or which have been processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one round of overlap growth for
   the imax index sets in is[].

   Outline:
     1. Count per destination process how many requested rows it owns
        (work arrays w1..w4), exchange message lengths, and post the
        phase-1 Irecvs/Isends under tag1.
     2. Expand the locally owned rows via MatIncreaseOverlap_MPIAIJ_Local().
     3. Answer the received requests via MatIncreaseOverlap_MPIAIJ_Receive()
        and send the answers back under tag2.
     4. Merge the received answers into the per-set bit tables and rebuild
        each is[i]; the input index sets are destroyed and replaced.

   Collective on C: the message-length gathers and the sends/receives
   involve every process in the communicator.
*/
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
56 {
57   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
58   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
59   const PetscInt **idx,*idx_i;
60   PetscInt       *n,**data,len;
61   PetscErrorCode ierr;
62   PetscMPIInt    size,rank,tag1,tag2;
63   PetscInt       m,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
64   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
65   PetscBT        *table;
66   MPI_Comm       comm;
67   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68   MPI_Status     *s_status,*recv_status;
69 
70   PetscFunctionBegin;
71   comm   = ((PetscObject)C)->comm;
72   size   = c->size;
73   rank   = c->rank;
74   m      = C->rmap->N;
75 
76   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
77   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
78 
79   ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);
80 
81   for (i=0; i<imax; i++) {
82     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
83     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
84   }
85 
86   /* evaluate communication - mesg to who,length of mesg, and buffer space
87      required. Based on this, buffers are allocated, and data copied into them*/
88   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
89   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w1[p]: total no of rows destined for proc p */
90   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w2[p]: 1 iff a message goes to proc p */
91   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w3[p]: no of index sets with rows for proc p */
92   for (i=0; i<imax; i++) {
93     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w4[p]: rows of is[i] owned by proc p */
94     idx_i = idx[i];
95     len   = n[i];
96     for (j=0; j<len; j++) {
97       row  = idx_i[j];
98       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
99       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
100       w4[proc]++;
101     }
102     for (j=0; j<size; j++){
103       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
104     }
105   }
106 
107   nrqs     = 0;              /* no of outgoing messages */
108   msz      = 0;              /* total mesg length (for all procs) */
109   w1[rank] = 0;              /* no mesg sent to itself */
110   w3[rank] = 0;
111   for (i=0; i<size; i++) {
112     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
113   }
114   /* pa - is list of processors to communicate with */
115   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
116   for (i=0,j=0; i<size; i++) {
117     if (w1[i]) {pa[j] = i; j++;}
118   }
119 
120   /* Each message would have a header = 1 + 2*(no of IS) + data */
121   for (i=0; i<nrqs; i++) {
122     j      = pa[i];
123     w1[j] += w2[j] + 2*w3[j];  /* grow message length by its header size */
124     msz   += w1[j];
125   }
126 
127   /* Determine the number of messages to expect, their lengths, from from-ids */
128   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
129   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
130 
131   /* Now post the Irecvs corresponding to these messages */
132   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
133 
134   /* Allocate Memory for outgoing messages */
135   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
136   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
137   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
138 
  /* carve the single buffer tmp into one contiguous slice per destination:
     outdat[j] points at the start of the message bound for proc j */
139   {
140     PetscInt *iptr = tmp,ict  = 0;
141     for (i=0; i<nrqs; i++) {
142       j         = pa[i];
143       iptr     +=  ict;
144       outdat[j] = iptr;
145       ict       = w1[j];
146     }
147   }
148 
149   /* Form the outgoing messages */
150   /*plug in the headers*/
151   for (i=0; i<nrqs; i++) {
152     j            = pa[i];
153     outdat[j][0] = 0;
154     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
155     ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* ptr[j]: write cursor into the data area past the header */
156   }
157 
158   /* Memory for doing local proc's work*/
159   {
160     PetscInt  *d_p;
161     char      *t_p;
162 
    /* One slab carved into: imax PetscBT bit tables, imax data[] index
       arrays (m entries each), and the imax isz[] counts. */
163     /* should replace with PetscMallocN() */
164     ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
165       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
166     ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
167       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
168     data  = (PetscInt **)(table + imax);
169     isz   = (PetscInt  *)(data  + imax);
170     d_p   = (PetscInt  *)(isz   + imax);
171     t_p   = (char *)(d_p   + m*imax);
172     for (i=0; i<imax; i++) {
173       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
174       data[i]  = d_p + (m)*i;
175     }
176   }
177 
178   /* Parse the IS and update local tables and the outgoing buf with the data*/
179   {
180     PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
181     PetscBT table_i;
182 
183     for (i=0; i<imax; i++) {
184       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
185       n_i     = n[i];
186       table_i = table[i];
187       idx_i   = idx[i];
188       data_i  = data[i];
189       isz_i   = isz[i];
190       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
191         row  = idx_i[j];
192         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
193         if (proc != rank) { /* copy to the outgoing buffer */
194           ctr[proc]++;
195           *ptr[proc] = row;
196           ptr[proc]++;
197         } else { /* Update the local table */
198           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
199         }
200       }
201       /* Update the headers for the current IS */
202       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
203         if ((ctr_j = ctr[j])) {
204           outdat_j        = outdat[j];
205           k               = ++outdat_j[0];
206           outdat_j[2*k]   = ctr_j;  /* header pair: (index-set id, row count) */
207           outdat_j[2*k-1] = i;
208         }
209       }
210       isz[i] = isz_i;
211     }
212   }
213 
214   /*  Now  post the sends */
215   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
216   for (i=0; i<nrqs; ++i) {
217     j    = pa[i];
218     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
219   }
220 
221   /* No longer need the original indices*/
222   for (i=0; i<imax; ++i) {
223     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
224   }
225   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
226 
227   for (i=0; i<imax; ++i) {
228     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
229   }
230 
231   /* Do Local work*/
232   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
233 
234   /* Receive messages*/
235   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
236   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
237 
238   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
239   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
240 
241   /* Phase 1 sends are complete - deallocate buffers */
242   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
243   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
244 
245   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
246   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
247   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
248   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
249   ierr = PetscFree(rbuf);CHKERRQ(ierr);
250 
251 
252  /* Send the data back*/
253   /* Do a global reduction to know the buffer space req for incoming messages*/
254   {
255     PetscMPIInt *rw1;
256 
257     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
258     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
259 
260     for (i=0; i<nrqr; ++i) {
261       proc      = recv_status[i].MPI_SOURCE;
262       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
263       rw1[proc] = isz1[i];
264     }
265     ierr = PetscFree(onodes1);CHKERRQ(ierr);
266     ierr = PetscFree(olengths1);CHKERRQ(ierr);
267 
268     /* Determine the number of messages to expect, their lengths, from from-ids */
269     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
270     ierr = PetscFree(rw1);CHKERRQ(ierr);
271   }
272   /* Now post the Irecvs corresponding to these messages */
273   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
274 
275   /*  Now  post the sends */
276   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
277   for (i=0; i<nrqr; ++i) {
278     j    = recv_status[i].MPI_SOURCE;
279     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
280   }
281 
282   /* receive work done on other processors*/
283   {
284     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
285     PetscMPIInt idex;
286     PetscBT     table_i;
287     MPI_Status  *status2;
288 
289     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
290     for (i=0; i<nrqs; ++i) {
291       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
292       /* Process the message: same header layout as the outgoing messages,
         merging new rows into the bit table of the index set they belong to */
293       rbuf2_i = rbuf2[idex];
294       ct1     = 2*rbuf2_i[0]+1;
295       jmax    = rbuf2[idex][0];
296       for (j=1; j<=jmax; j++) {
297         max     = rbuf2_i[2*j];
298         is_no   = rbuf2_i[2*j-1];
299         isz_i   = isz[is_no];
300         data_i  = data[is_no];
301         table_i = table[is_no];
302         for (k=0; k<max; k++,ct1++) {
303           row = rbuf2_i[ct1];
304           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
305         }
306         isz[is_no] = isz_i;
307       }
308     }
309 
310     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
311     ierr = PetscFree(status2);CHKERRQ(ierr);
312   }
313 
  /* Rebuild each index set from the merged local+remote results */
314   for (i=0; i<imax; ++i) {
315     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
316   }
317 
318   ierr = PetscFree(onodes2);CHKERRQ(ierr);
319   ierr = PetscFree(olengths2);CHKERRQ(ierr);
320 
321   ierr = PetscFree(pa);CHKERRQ(ierr);
322   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
323   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
324   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
325   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
326   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
327   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
328   ierr = PetscFree(table);CHKERRQ(ierr);
329   ierr = PetscFree(s_status);CHKERRQ(ierr);
330   ierr = PetscFree(recv_status);CHKERRQ(ierr);
331   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
332   ierr = PetscFree(xdata);CHKERRQ(ierr);
333   ierr = PetscFree(isz1);CHKERRQ(ierr);
334   PetscFunctionReturn(0);
335 }
336 
337 #undef __FUNCT__
338 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
339 /*
340    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
341        the work on the local processor.
342 
343      Inputs:
344       C      - MAT_MPIAIJ;
345       imax - total no of index sets processed at a time;
346       table  - an array of char - size = m bits.
347 
348      Output:
349       isz    - array containing the count of the solution elements corresponding
350                to each index set;
351       data   - pointer to the solutions
352 */
353 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
354 {
355   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
356   Mat        A = c->A,B = c->B;
357   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
358   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
359   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
360   PetscBT    table_i;
361 
362   PetscFunctionBegin;
363   rstart = C->rmap->rstart;
364   cstart = C->cmap->rstart;
365   ai     = a->i;
366   aj     = a->j;
367   bi     = b->i;
368   bj     = b->j;
369   garray = c->garray;
370 
371 
  /* For each index set: walk the rows currently listed in data[i] and append
     every column they touch (diagonal block A and off-diagonal block B) that
     is not yet recorded in the set's bit table. */
372   for (i=0; i<imax; i++) {
373     data_i  = data[i];
374     table_i = table[i];
375     isz_i   = isz[i];
376     for (j=0,max=isz[i]; j<max; j++) { /* bound fixed at entry: rows appended below are not re-expanded in this pass */
377       row   = data_i[j] - rstart;  /* convert global row to local row of the diagonal block */
378       start = ai[row];
379       end   = ai[row+1];
380       for (k=start; k<end; k++) { /* Amat */
381         val = aj[k] + cstart;  /* local column -> global column */
382         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
383       }
384       start = bi[row];
385       end   = bi[row+1];
386       for (k=start; k<end; k++) { /* Bmat */
387         val = garray[bj[k]];  /* compressed off-diagonal column -> global column */
388         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
389       }
390     }
391     isz[i] = isz_i;
392   }
393   PetscFunctionReturn(0);
394 }
395 
396 #undef __FUNCT__
397 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
398 /*
399       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
400          and return the output
401 
402          Input:
403            C    - the matrix
404            nrqr - no of messages being processed.
405            rbuf - an array of pointers to the recieved requests
406 
407          Output:
408            xdata - array of messages to be sent back
409            isz1  - size of each message
410 
411   For better efficiency perhaps we should malloc separately each xdata[i],
412 then if a remalloc is required we need only copy the data for that one row
413 rather then all previous rows as it is now where a single large chunck of
414 memory is used.
415 
416 */
417 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
418 {
419   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
420   Mat            A = c->A,B = c->B;
421   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
422   PetscErrorCode ierr;
423   PetscMPIInt    rank;
424   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
425   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
426   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
427   PetscInt       *rbuf_i,kmax,rbuf_0;
428   PetscBT        xtable;
429 
430   PetscFunctionBegin;
431   rank   = c->rank;
432   m      = C->rmap->N;
433   rstart = C->rmap->rstart;
434   cstart = C->cmap->rstart;
435   ai     = a->i;
436   aj     = a->j;
437   bi     = b->i;
438   bj     = b->j;
439   garray = c->garray;
440 
441 
442   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
443     rbuf_i  =  rbuf[i];
444     rbuf_0  =  rbuf_i[0];
445     ct     += rbuf_0;
446     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
447   }
448 
449   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
450   else      max1 = 1;
451   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
452   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
453   ++no_malloc;
454   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
455   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
456 
457   ct3 = 0;
458   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
459     rbuf_i =  rbuf[i];
460     rbuf_0 =  rbuf_i[0];
461     ct1    =  2*rbuf_0+1;
462     ct2    =  ct1;
463     ct3    += ct1;
464     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
465       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
466       oct2 = ct2;
467       kmax = rbuf_i[2*j];
468       for (k=0; k<kmax; k++,ct1++) {
469         row = rbuf_i[ct1];
470         if (!PetscBTLookupSet(xtable,row)) {
471           if (!(ct3 < mem_estimate)) {
472             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
473             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
474             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
475             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
476             xdata[0]     = tmp;
477             mem_estimate = new_estimate; ++no_malloc;
478             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
479           }
480           xdata[i][ct2++] = row;
481           ct3++;
482         }
483       }
484       for (k=oct2,max2=ct2; k<max2; k++) {
485         row   = xdata[i][k] - rstart;
486         start = ai[row];
487         end   = ai[row+1];
488         for (l=start; l<end; l++) {
489           val = aj[l] + cstart;
490           if (!PetscBTLookupSet(xtable,val)) {
491             if (!(ct3 < mem_estimate)) {
492               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
493               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
494               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
495               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
496               xdata[0]     = tmp;
497               mem_estimate = new_estimate; ++no_malloc;
498               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
499             }
500             xdata[i][ct2++] = val;
501             ct3++;
502           }
503         }
504         start = bi[row];
505         end   = bi[row+1];
506         for (l=start; l<end; l++) {
507           val = garray[bj[l]];
508           if (!PetscBTLookupSet(xtable,val)) {
509             if (!(ct3 < mem_estimate)) {
510               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
511               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
512               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
513               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
514               xdata[0]     = tmp;
515               mem_estimate = new_estimate; ++no_malloc;
516               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517             }
518             xdata[i][ct2++] = val;
519             ct3++;
520           }
521         }
522       }
523       /* Update the header*/
524       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
525       xdata[i][2*j-1] = rbuf_i[2*j-1];
526     }
527     xdata[i][0] = rbuf_0;
528     xdata[i+1]  = xdata[i] + ct2;
529     isz1[i]     = ct2; /* size of each message */
530   }
531   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
532   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
533   PetscFunctionReturn(0);
534 }
535 /* -------------------------------------------------------------------------*/
536 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
537 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
538 /*
539     Every processor gets the entire matrix
540 */
541 #undef __FUNCT__
542 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Builds on EVERY process a sequential matrix
   containing the entire parallel matrix A.

   With scall == MAT_INITIAL_MATRIX the sequential matrix is created and its
   nonzero structure gathered (*Bin is malloc'd here and handed to the
   caller); otherwise the previously created matrix **Bin is reused.
   Numerical values are copied and gathered only when flag == MAT_GET_VALUES.
   Collective on A (uses MPI_Allgatherv on A's communicator).
*/
543 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
544 {
545   Mat            B;
546   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
547   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
548   PetscErrorCode ierr;
549   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
550   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
551   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
552   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
553 
554   PetscFunctionBegin;
555   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
556   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
557 
558   if (scall == MAT_INITIAL_MATRIX) {
559     /* ----------------------------------------------------------------
560          Tell every processor the number of nonzeros per row
561     */
562     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* only the locally owned entries of lens are filled here; the
       Allgatherv below completes the array */
563     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
564       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
565     }
566     sendcount = A->rmap->rend - A->rmap->rstart;
567     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
568     for (i=0; i<size; i++) {
569       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
570       displs[i]     = A->rmap->range[i];
571     }
572 #if defined(PETSC_HAVE_MPI_IN_PLACE)
573     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
574 #else
    /* NOTE(review): send buffer lens+rstart aliases the receive buffer lens
       when MPI_IN_PLACE is unavailable -- legacy pattern, confirm the MPI
       implementation tolerates it */
575     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
576 #endif
577     /* ---------------------------------------------------------------
578          Create the sequential matrix of the same type as the local block diagonal
579     */
580     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
581     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
582     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
583     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
584     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
585     **Bin = B;  /* caller receives ownership of the malloc'd Mat pointer */
586     b = (Mat_SeqAIJ *)B->data;
587 
588     /*--------------------------------------------------------------------
589        Copy my part of matrix column indices over
590     */
591     sendcount  = ad->nz + bd->nz;
592     jsendbuf   = b->j + b->i[rstarts[rank]];
593     a_jsendbuf = ad->j;
594     b_jsendbuf = bd->j;
595     n          = A->rmap->rend - A->rmap->rstart;
596     cnt        = 0;
    /* interleave off-diagonal (B) and diagonal (A) entries so each row's
       global column indices come out in ascending order */
597     for (i=0; i<n; i++) {
598 
599       /* put in lower diagonal portion */
600       m = bd->i[i+1] - bd->i[i];
601       while (m > 0) {
602         /* is it above diagonal (in bd (compressed) numbering) */
603         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
604         jsendbuf[cnt++] = garray[*b_jsendbuf++];
605         m--;
606       }
607 
608       /* put in diagonal portion */
609       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
610         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
611       }
612 
613       /* put in upper diagonal portion */
614       while (m-- > 0) {
615         jsendbuf[cnt++] = garray[*b_jsendbuf++];
616       }
617     }
618     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
619 
620     /*--------------------------------------------------------------------
621        Gather all column indices to all processors
622     */
623     for (i=0; i<size; i++) {
624       recvcounts[i] = 0;
625       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
626         recvcounts[i] += lens[j];
627       }
628     }
629     displs[0]  = 0;
630     for (i=1; i<size; i++) {
631       displs[i] = displs[i-1] + recvcounts[i-1];
632     }
633 #if defined(PETSC_HAVE_MPI_IN_PLACE)
634     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
635 #else
636     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
637 #endif
638     /*--------------------------------------------------------------------
639         Assemble the matrix into useable form (note numerical values not yet set)
640     */
641     /* set the b->ilen (length of each row) values */
642     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
643     /* set the b->i indices */
644     b->i[0] = 0;
645     for (i=1; i<=A->rmap->N; i++) {
646       b->i[i] = b->i[i-1] + lens[i-1];
647     }
648     ierr = PetscFree(lens);CHKERRQ(ierr);
649     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
650     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
651 
652   } else {
653     B  = **Bin;
654     b = (Mat_SeqAIJ *)B->data;
655   }
656 
657   /*--------------------------------------------------------------------
658        Copy my part of matrix numerical values into the values location
659   */
660   if (flag == MAT_GET_VALUES){
661     sendcount = ad->nz + bd->nz;
662     sendbuf   = b->a + b->i[rstarts[rank]];
663     a_sendbuf = ad->a;
664     b_sendbuf = bd->a;
665     b_sendj   = bd->j;
666     n         = A->rmap->rend - A->rmap->rstart;
667     cnt       = 0;
    /* same interleave order as the column-index pass above, so values line
       up with the already-gathered structure */
668     for (i=0; i<n; i++) {
669 
670       /* put in lower diagonal portion */
671       m = bd->i[i+1] - bd->i[i];
672       while (m > 0) {
673         /* is it above diagonal (in bd (compressed) numbering) */
674         if (garray[*b_sendj] > A->rmap->rstart + i) break;
675         sendbuf[cnt++] = *b_sendbuf++;
676         m--;
677         b_sendj++;
678       }
679 
680       /* put in diagonal portion */
681       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
682         sendbuf[cnt++] = *a_sendbuf++;
683       }
684 
685       /* put in upper diagonal portion */
686       while (m-- > 0) {
687         sendbuf[cnt++] = *b_sendbuf++;
688         b_sendj++;
689       }
690     }
691     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
692 
693     /* -----------------------------------------------------------------
694        Gather all numerical values to all processors
695     */
696     if (!recvcounts) {
697       ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
698     }
699     for (i=0; i<size; i++) {
700       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
701     }
702     displs[0]  = 0;
703     for (i=1; i<size; i++) {
704       displs[i] = displs[i-1] + recvcounts[i-1];
705     }
706     recvbuf   = b->a;
707 #if defined(PETSC_HAVE_MPI_IN_PLACE)
708     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
709 #else
710     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
711 #endif
712   }  /* endof (flag == MAT_GET_VALUES) */
713   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
714 
  /* propagate symmetry information from A to the gathered matrix */
715   if (A->symmetric){
716     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
717   } else if (A->hermitian) {
718     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
719   } else if (A->structurally_symmetric) {
720     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
721   }
722   PetscFunctionReturn(0);
723 }
724 
725 
726 
727 #undef __FUNCT__
728 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
729 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
730 {
731   PetscErrorCode ierr;
732   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
733   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
734 
735   PetscFunctionBegin;
736   /*
737        Check for special case each processor gets entire matrix
738   */
739   if (ismax == 1 && C->rmap->N == C->cmap->N) {
740     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
741     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
742     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
743     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
744     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
745       wantallmatrix = PETSC_TRUE;
746       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
747     }
748   }
749   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
750   if (twantallmatrix) {
751     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
752     PetscFunctionReturn(0);
753   }
754 
755   /* Allocate memory to hold all the submatrices */
756   if (scall != MAT_REUSE_MATRIX) {
757     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
758   }
759   /* Determine the number of stages through which submatrices are done */
760   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
761   /*
762      Each stage will extract nmax submatrices.
763      nmax is determined by the matrix column dimension.
764      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
765   */
766   if (!nmax) nmax = 1;
767   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
768 
769   /* Make sure every processor loops through the nstages */
770   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
771 
772   for (i=0,pos=0; i<nstages; i++) {
773     if (pos+nmax <= ismax) max_no = nmax;
774     else if (pos == ismax) max_no = 0;
775     else                   max_no = ismax-pos;
776     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
777     pos += max_no;
778   }
779   PetscFunctionReturn(0);
780 }
781 
782 
783 
784 
785 /* -------------------------------------------------------------------------*/
786 #undef __FUNCT__
787 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts ismax sequential (SeqAIJ) submatrices from
   the parallel MPIAIJ matrix C, one per (isrow[i], iscol[i]) pair, depositing them in
   submats[].  Rows owned by other processes are fetched via four rounds of nonblocking
   point-to-point messages:

     round 1 (tag0): row requests to the owning processes      (sbuf1  -> rbuf1)
     round 2 (tag1): per-requested-row nonzero counts back     (sbuf2  -> rbuf2)
     round 3 (tag2): global column indices of those rows back  (sbuf_aj -> rbuf3)
     round 4 (tag3): numerical values of those rows back       (sbuf_aa -> rbuf4)

   The request messages use a header layout similar to the one documented above
   MatIncreaseOverlap_MPIAIJ_Once: mesg[0] = number of index sets, followed by
   (is-index, count) pairs, followed by the row indices themselves.

   If scall == MAT_REUSE_MATRIX the existing submats[] are refilled in place; their
   nonzero structure must match the new extraction exactly (checked below).
   Otherwise fresh SeqAIJ matrices are created and preallocated from the computed
   row lengths (lens).

   Naming conventions: nrqs = number of requests this process sends out;
   nrqr = number of requests it receives.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted */
  /* NOTE(review): the results of these ISSorted() calls are discarded (the
     SETERRQs are commented out), so 'sorted' is effectively unused — presumably
     the sortedness requirement was relaxed at some point; confirm before removing. */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the raw indices and local sizes of every row/column index set. */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total rows requested from proc p, w2[p] = 1 iff any message goes to p,
     w3[p] = number of index sets that touch p, w4 = per-IS scratch counts. */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find the process owning this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists (in increasing order) the ranks this process sends requests to */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz entries back the send buffers (sbuf1),
     the second msz entries are later used as the round-2 receive area (rbuf2). */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve tmp into per-destination send buffers of length w1[j] each. */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];   /* bump the IS count in the header */
        sbuf1_j[2*k]   = ctr_j;          /* number of rows requested for this IS */
        sbuf1_j[2*k-1] = i;              /* which IS these rows belong to */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 reuses the second half of tmp (tmp+msz); each request's reply is at
     most as long as the request itself, so the same per-destination sizes fit. */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For every incoming request: count the nonzeros of each requested row
     (diagonal block A plus off-diagonal block B) and reply with the counts. */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;  /* skip the request header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;  /* local row index */
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];  /* total nonzeros the requester must allocate for */
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply arrives, allocate exactly-sized buffers and post the
     receives for the column-index (round 3) and value (round 4) payloads. */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* One contiguous allocation, sliced per incoming request (req_size[i] ints each). */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global columns in ascending order: B-part left of the diagonal
             block, then the A (diagonal) part, then B-part right of it;
             bmap (garray) maps B's compressed columns to global indices */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Same slicing as sbuf_aj, but for the numerical values (round 4). */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          /* values must be emitted in exactly the same order as the column
             indices were in the round-3 loop above */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  /* cmap[i][gcol] = position of global column gcol in iscol[i], PLUS ONE
     (0 means "column not selected"); one dense row of length C->cmap->N per IS */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of row j of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  /* Locally-owned requested rows can be counted without any communication. */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map*/
  /* rmap[i][grow] = local row of global row grow within submatrix i
     (dense array of length C->rmap->N per IS, like cmap but 0-based) */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* Round-3 replies carry column indices of remote rows; count the ones
     selected by each iscol to finish the row-length table. */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;  /* cmap is 1-based; stored columns are 0-based */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* As each round-4 (values) message lands, scatter its rows into the
     corresponding submatrices using the same header walk as the lens pass. */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* freeing tmp here also releases the storage behind sbuf1[] and rbuf2[] */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1387 }
1388 
1389 /*
1390  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1391  Be careful not to destroy them elsewhere.
1392  */
1393 #undef __FUNCT__
1394 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1395 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1396 {
1397   /* If making this function public, change the error returned in this function away from _PLIB. */
1398   PetscErrorCode ierr;
1399   Mat_MPIAIJ *aij;
1400   PetscBool seqaij;
1401 
1402   PetscFunctionBegin;
1403   /* Check to make sure the component matrices are compatible with C. */
1404   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1405   if(!seqaij) {
1406     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1407   }
1408   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1409   if(!seqaij) {
1410     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1411   }
1412   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1413     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1414   }
1415   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1416   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1417   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1418   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1419   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1420   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1421   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1422     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1423   }
1424   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1425   aij = (Mat_MPIAIJ*)((*C)->data);
1426   aij->A = A;
1427   aij->B = B;
1428   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1429   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1430   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1431   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 #undef __FUNCT__
1436 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1437 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1438 {
1439   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1440   PetscFunctionBegin;
1441   PetscValidPointer(A,2);
1442   PetscValidPointer(B,3);
1443   *A = aij->A;
1444   *B = aij->B;
1445   /* Note that we don't incref *A and *B, so be careful! */
1446   PetscFunctionReturn(0);
1447 }
1448 
1449 
1450 #undef __FUNCT__
1451 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1452 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1453 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1454 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1455 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1456 {
1457   PetscErrorCode ierr;
1458   PetscMPIInt size, flag;
1459   PetscInt    i,ii;
1460   PetscInt    ismax_c;
1461   PetscFunctionBegin;
1462   if(!ismax) {
1463     PetscFunctionReturn(0);
1464   }
1465   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1466     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1467     if(flag != MPI_IDENT) {
1468       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1469     }
1470     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1471     if(size > 1) {
1472       ++ismax_c;
1473     }
1474   }
1475   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1476     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1477   }
1478   else { /* if(ismax_c) */
1479     Mat *A,*B;
1480     IS  *isrow_c, *iscol_c;
1481     PetscMPIInt size;
1482     /*
1483      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1484      array of sequential matrices underlying the resulting parallel matrices.
1485      Which arrays to allocate is based on the value of MatReuse scall.
1486      There are as many diag matrices as there are original index sets.
1487      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1488 
1489      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1490      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1491      will deposite the extracted diag and off-diag parts.
1492      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1493      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1494     */
1495 
1496     /* Parallel matrix array is allocated only if no reuse is taking place. */
1497     if (scall != MAT_REUSE_MATRIX) {
1498       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1499     }
1500     else {
1501       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1502       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1503       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1504       for(i = 0, ii = 0; i < ismax; ++i) {
1505         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1506         if(size > 1) {
1507           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1508           ++ii;
1509         }
1510         else {
1511           A[i] = (*submat)[i];
1512         }
1513       }
1514     }
1515     /*
1516      Construct the complements of the iscol ISs for parallel ISs only.
1517      These are used to extract the off-diag portion of the resulting parallel matrix.
1518      The row IS for the off-diag portion is the same as for the diag portion,
1519      so we merely alias the row IS, while skipping those that are sequential.
1520     */
1521     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1522     for(i = 0, ii = 0; i < ismax; ++i) {
1523       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1524       if(size > 1) {
1525 	isrow_c[ii] = isrow[i];
1526 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1527 	++ii;
1528       }
1529     }
1530     /* Now obtain the sequential A and B submatrices separately. */
1531     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1532     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1533     for(ii = 0; ii < ismax_c; ++ii) {
1534       ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
1535     }
1536     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1537     /*
1538      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1539      have been extracted directly into the parallel matrices containing them, or
1540      simply into the sequential matrix identical with the corresponding A (if size == 1).
1541      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1542      A is put into the correct submat slot (if size == 1).
1543      */
1544     if(scall != MAT_REUSE_MATRIX) {
1545       for(i = 0, ii = 0; i < ismax; ++i) {
1546         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1547         if(size > 1) {
1548           /*
1549            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1550            */
1551           /* Construct submat[i] from the Seq pieces A and B. */
1552           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1553 
1554           ++ii;
1555         }
1556         else {
1557           (*submat)[i] = A[i];
1558         }
1559       }
1560     }
1561     ierr = PetscFree(A); CHKERRQ(ierr);
1562     ierr = PetscFree(B); CHKERRQ(ierr);
1563   }
1564   PetscFunctionReturn(0);
1565 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1566 
1567 
1568 
1569 #undef __FUNCT__
1570 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1571 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1572 {
1573   PetscErrorCode ierr;
1574   PetscFunctionBegin;
1575   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1576                                            MatCreateMPIAIJFromSeqMatrices_Private,
1577                                            MatMPIAIJExtractSeqMatrices_Private);
1578   CHKERRQ(ierr);
1579   PetscFunctionReturn(0);
1580 }
1581