xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision acfcf0e5ba359b58e6a8a7af3f239cd7334278e8)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (no of index sets in the mesg)
37   -----------
38   mesg [1] = 1 => is[1]
39   mesg [2] = sizeof(is[1]);
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = sizeof(is[5]);
43   -----------
44   mesg [5]
  mesg [n]  data(is[1])
46   -----------
47   mesg[n+1]
48   mesg[m]  data(is[5])
49   -----------
50 
51   Notes:
52   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
54 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs a single level of overlap
   expansion for the imax index sets in is[].

   Rows owned by this process are expanded locally (via
   MatIncreaseOverlap_MPIAIJ_Local); rows owned by other processes are
   shipped to their owners (tag1), expanded there (via
   MatIncreaseOverlap_MPIAIJ_Receive), sent back (tag2), and merged into
   the local result.  Each is[i] is destroyed and replaced by a new,
   larger IS on PETSC_COMM_SELF.  See the message-format comment above
   for the layout of the request buffers.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  m      = C->rmap->N;     /* global number of rows */

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc*/
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total #indices destined for p; w2[p]: 1 iff any message to p;
     w3[p]: #index sets contributing rows owned by p; w4: per-IS scratch */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single buffer tmp into one outgoing message per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work*/
  /* one allocation holds table[], data[], isz[] and their backing storage;
     the pointers below alias into it, so only table is ever freed */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should replace with PetscMallocN() */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char *)(d_p   + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* is[] will be recreated below from isz[]/data[] */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the remote requests; xdata[] aliases into xdata[0] (see Receive) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* start of data, past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (expanded) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
349 /*
350    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
351        the work on the local processor.
352 
353      Inputs:
354       C      - MAT_MPIAIJ;
355       imax - total no of index sets processed at a time;
356       table  - an array of char - size = m bits.
357 
358      Output:
359       isz    - array containing the count of the solution elements correspondign
360                to each index set;
361       data   - pointer to the solutions
362 */
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
369   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = C->rmap->rstart;
374   cstart = C->cmap->rstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
408 /*
409       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
410          and return the output
411 
412          Input:
413            C    - the matrix
414            nrqr - no of messages being processed.
415            rbuf - an array of pointers to the recieved requests
416 
417          Output:
418            xdata - array of messages to be sent back
419            isz1  - size of each message
420 
421   For better efficiency perhaps we should malloc separately each xdata[i],
422 then if a remalloc is required we need only copy the data for that one row
423 rather then all previous rows as it is now where a single large chunck of
424 memory is used.
425 
426 */
427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
428 {
429   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
430   Mat            A = c->A,B = c->B;
431   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   PetscErrorCode ierr;
433   PetscMPIInt    rank;
434   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
435   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
436   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
437   PetscInt       *rbuf_i,kmax,rbuf_0;
438   PetscBT        xtable;
439 
440   PetscFunctionBegin;
441   rank   = c->rank;
442   m      = C->rmap->N;
443   rstart = C->rmap->rstart;
444   cstart = C->cmap->rstart;
445   ai     = a->i;
446   aj     = a->j;
447   bi     = b->i;
448   bj     = b->j;
449   garray = c->garray;
450 
451 
452   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
453     rbuf_i  =  rbuf[i];
454     rbuf_0  =  rbuf_i[0];
455     ct     += rbuf_0;
456     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
457   }
458 
459   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
460   else      max1 = 1;
461   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
462   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
463   ++no_malloc;
464   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
465   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
466 
467   ct3 = 0;
468   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
469     rbuf_i =  rbuf[i];
470     rbuf_0 =  rbuf_i[0];
471     ct1    =  2*rbuf_0+1;
472     ct2    =  ct1;
473     ct3    += ct1;
474     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
475       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
476       oct2 = ct2;
477       kmax = rbuf_i[2*j];
478       for (k=0; k<kmax; k++,ct1++) {
479         row = rbuf_i[ct1];
480         if (!PetscBTLookupSet(xtable,row)) {
481           if (!(ct3 < mem_estimate)) {
482             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
484             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
485             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
486             xdata[0]     = tmp;
487             mem_estimate = new_estimate; ++no_malloc;
488             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
489           }
490           xdata[i][ct2++] = row;
491           ct3++;
492         }
493       }
494       for (k=oct2,max2=ct2; k<max2; k++) {
495         row   = xdata[i][k] - rstart;
496         start = ai[row];
497         end   = ai[row+1];
498         for (l=start; l<end; l++) {
499           val = aj[l] + cstart;
500           if (!PetscBTLookupSet(xtable,val)) {
501             if (!(ct3 < mem_estimate)) {
502               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
503               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
504               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
505               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
506               xdata[0]     = tmp;
507               mem_estimate = new_estimate; ++no_malloc;
508               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
509             }
510             xdata[i][ct2++] = val;
511             ct3++;
512           }
513         }
514         start = bi[row];
515         end   = bi[row+1];
516         for (l=start; l<end; l++) {
517           val = garray[bj[l]];
518           if (!PetscBTLookupSet(xtable,val)) {
519             if (!(ct3 < mem_estimate)) {
520               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
521               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
522               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
523               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
524               xdata[0]     = tmp;
525               mem_estimate = new_estimate; ++no_malloc;
526               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
527             }
528             xdata[i][ct2++] = val;
529             ct3++;
530           }
531         }
532       }
533       /* Update the header*/
534       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
535       xdata[i][2*j-1] = rbuf_i[2*j-1];
536     }
537     xdata[i][0] = rbuf_0;
538     xdata[i+1]  = xdata[i] + ct2;
539     isz1[i]     = ct2; /* size of each message */
540   }
541   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
542   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 /* -------------------------------------------------------------------------*/
546 EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
547 EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix

    Builds (or refills, for MAT_REUSE_MATRIX) a sequential AIJ matrix B on
    each process containing all rows and columns of the parallel matrix A.
    *Bin is a malloc'ed Mat* holding B.  With flag==MAT_GET_VALUES the
    numerical values are gathered too; otherwise only the structure is built.
    Writes directly into the Mat_SeqAIJ internals (b->i, b->ilen, b->j, b->a)
    so that each process's slice can be filled in place and completed with
    MPI_Allgatherv (in-place when PETSC_HAVE_MPI_IN_PLACE is available).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* lens[i] = nnz of global row i = diagonal-block nnz + off-diagonal nnz */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* Columns must come out in ascending global order, so the off-diagonal
       entries are split into the part below and the part above the diagonal
       block, with the diagonal-block entries in between */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs now count nonzeros per owning process, not rows */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    /* same lower/diagonal/upper splitting as for the column indices above */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed only at the end of the INITIAL path,
       so they may still exist; allocate only when entering via REUSE */
    if (!recvcounts) {
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
734 
735 #undef __FUNCT__
736 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
737 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
738 {
739   PetscErrorCode ierr;
740   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
741   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
742 
743   PetscFunctionBegin;
744   /*
745        Check for special case each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
755     }
756   }
757   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
758   if (twantallmatrix) {
759     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
760     PetscFunctionReturn(0);
761   }
762 
763   /* Allocate memory to hold all the submatrices */
764   if (scall != MAT_REUSE_MATRIX) {
765     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
766   }
767   /* Determine the number of stages through which submatrices are done */
768   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
769   if (!nmax) nmax = 1;
770   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
771 
772   /* Make sure every processor loops through the nstages */
773   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
774 
775   for (i=0,pos=0; i<nstages; i++) {
776     if (pos+nmax <= ismax) max_no = nmax;
777     else if (pos == ismax) max_no = 0;
778     else                   max_no = ismax-pos;
779     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
780     pos += max_no;
781   }
782   PetscFunctionReturn(0);
783 }
784 
785 /* -------------------------------------------------------------------------*/
786 #undef __FUNCT__
787 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts up to ismax sequential AIJ submatrices
   of the parallel matrix C, one per (isrow[i],iscol[i]) pair, into submats[].
   Called in stages by the driver routine so that scratch memory stays bounded.

   Collective: every process participates in the request/reply exchange below.

   Input Parameters:
     C      - the parallel MPIAIJ matrix
     ismax  - number of index-set pairs handled in this stage
     isrow  - row index sets (global row numbers)
     iscol  - column index sets (global column numbers)
     scall  - MAT_INITIAL_MATRIX (create submats) or MAT_REUSE_MATRIX (refill them;
              assumes the sparsity pattern is unchanged -- see check below)

   Output Parameter:
     submats - the sequential submatrices

   Communication protocol (four distinct tags keep the phases separated):
     tag0 - request message: header (count of ISs, then per-IS (is-id,row-count)
            pairs) followed by the global row numbers needed from the owner
     tag1 - reply carrying per-requested-row nonzero counts, so the requester
            can size the index/value receive buffers
     tag2 - global column indices of the requested rows
     tag3 - numerical values of the requested rows
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the row/col indices are sorted; the error checks are currently
     disabled, so this loop only exercises ISSorted -- NOTE(review): confirm
     whether the checks can be re-enabled or the loop removed */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the raw indices and local sizes of every index set */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
       w1[p] - total number of rows requested from process p (over all ISs)
       w2[p] - 1 if any message goes to p, else 0
       w3[p] - number of ISs that need rows owned by p (one header pair each)
       w4[p] - scratch: rows of the current IS owned by p */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find which process owns row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists the destination ranks, in increasing order */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp holds 2*msz ints: the first msz
     are carved up among the sbuf1[] send buffers, the second msz are later
     aliased by rbuf2[] (see "Post Receives to capture the buffer size") */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve tmp into per-destination send buffers of length w1[dest] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space: sbuf1[j][0] = number of ISs in the message;
     the 2*w3[j] slots after it hold (is-id,row-count) pairs; ptr[j] points at
     the start of the data (row numbers) region */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;   /* number of rows of IS i needed from proc j */
        sbuf1_j[2*k-1] = i;       /* which IS these rows belong to */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size; rbuf2[] aliases the second half
     of tmp (tmp+msz), so it must not be freed separately from tmp */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive the requests (tag0), compute the nonzero count of each requested
     row from the diagonal (A) and off-diagonal (B) parts, and reply (tag1) */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;  /* data region begins after the header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      /* reply entry j = nonzeros in requested row j; req_size accumulates the
         total, which sizes the later index/value sends */
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header: total size first, then echo the request header back */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* As each size reply (tag1) arrives, allocate the index (rbuf3) and value
     (rbuf4) receive buffers and post the corresponding Irecvs (tag2/tag3) */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off; one contiguous slab is
     carved into per-requester pieces of length req_size[i] */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols, translated to the
             global numbering and in globally increasing order: off-diagonal
             columns left of the diagonal block, then the diagonal (A) block,
             then off-diagonal columns right of it */
          cols  = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off; values are packed in exactly
     the same order as the column indices above so they stay aligned */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  /* rbuf1 (the incoming requests) is no longer needed; both value/index sends
     above read from sbuf_aj/sbuf_aa, not from rbuf1 */
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: cmap[i][gcol] = (local column in submat i) + 1, or 0 if
     gcol is not selected by iscol[i]; the +1 lets 0 mean "absent" */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... lens[i][r] will hold the
     number of selected nonzeros in row r of submat i (preallocation counts) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data (rows this process owns) */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: rmap[i][grow] = local row of grow in submat i */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data: walk each incoming column-index message
     (tag2) alongside the matching send header in sbuf1 to attribute counts
     to the right IS and row */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows (entries owned by this process) */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;  /* undo the +1 offset used by cmap */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows: as each value message (tag4 recv, i.e.
       r_waits4) completes, its indices (rbuf3) and values (rbuf4) are walked
       in lock-step against the original send header in sbuf1 */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* freeing tmp also releases the storage that rbuf2[] aliases */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1387 
1388