xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 74ed9c26ab880fdcd1d3cdbb5b1e39d1b833147d)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (no of index sets in the mesg)
37   -----------
38   mesg [1] = 1 => is[1]
39   mesg [2] = sizeof(is[1]);
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = sizeof(is[5]);
43   -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
55 #undef __FUNCT__
56 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/* Perform one round of overlap increase: each IS in is[] is replaced (in place)
   by a new IS that additionally contains every column reachable from its rows.
   Rows owned by other processes are shipped to their owners (tag1), processed
   there, and the results are shipped back (tag2). */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  m      = C->rmap->N;

  /* Fresh tags keep the two phases of this exchange distinct from any other
     traffic on the communicator */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create lookup table for the mapping: global row -> owning process */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = number of rows to send to p; w3[p] = number of ISs contributing;
     w2[p] = 1 iff any message goes to p; w4 is per-IS scratch. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* number of outgoing messages */
  msz      = 0;              /* total message length (over all procs) */
  w1[rank] = 0;              /* no message sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: tmp is one contiguous slab of msz
     entries, outdat[p] points to proc p's slice of it, ptr[p] is the write
     cursor into that slice */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: outdat[j][0] = IS count, then (is_no,count) pairs */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one slab carved into the bit tables
     (table), per-IS result arrays (data) and their sizes (isz) */
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should replace with PetscMallocN() */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char *)(d_p   + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data:
     locally-owned rows go into data[i] (deduplicated via the bit table),
     remote rows are appended to the owner's outgoing message */
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* The input ISs are destroyed here; they are recreated from the expanded
     data at the end of this routine */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the received requests; xdata[i]/isz1[i] are the reply to sender i */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  /* NOTE(review): assumes PetscPostIrecvInt hands back a single allocation so
     freeing rbuf releases the receive buffers too - confirm against its source */
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors; merge each reply into the local
     data[]/isz[] arrays, again deduplicating through the bit tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total number of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
369   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = C->rmap->rstart;
374   cstart = C->cmap->rstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - number of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now, where a single large chunk of
memory is used.

*/
427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
428 {
429   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
430   Mat            A = c->A,B = c->B;
431   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   PetscErrorCode ierr;
433   PetscMPIInt    rank;
434   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
435   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
436   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
437   PetscInt       *rbuf_i,kmax,rbuf_0;
438   PetscBT        xtable;
439 
440   PetscFunctionBegin;
441   rank   = c->rank;
442   m      = C->rmap->N;
443   rstart = C->rmap->rstart;
444   cstart = C->cmap->rstart;
445   ai     = a->i;
446   aj     = a->j;
447   bi     = b->i;
448   bj     = b->j;
449   garray = c->garray;
450 
451 
452   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
453     rbuf_i  =  rbuf[i];
454     rbuf_0  =  rbuf_i[0];
455     ct     += rbuf_0;
456     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
457   }
458 
459   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
460   else      max1 = 1;
461   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
462   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
463   ++no_malloc;
464   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
465   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
466 
467   ct3 = 0;
468   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
469     rbuf_i =  rbuf[i];
470     rbuf_0 =  rbuf_i[0];
471     ct1    =  2*rbuf_0+1;
472     ct2    =  ct1;
473     ct3    += ct1;
474     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
475       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
476       oct2 = ct2;
477       kmax = rbuf_i[2*j];
478       for (k=0; k<kmax; k++,ct1++) {
479         row = rbuf_i[ct1];
480         if (!PetscBTLookupSet(xtable,row)) {
481           if (!(ct3 < mem_estimate)) {
482             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
484             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
485             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
486             xdata[0]     = tmp;
487             mem_estimate = new_estimate; ++no_malloc;
488             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
489           }
490           xdata[i][ct2++] = row;
491           ct3++;
492         }
493       }
494       for (k=oct2,max2=ct2; k<max2; k++) {
495         row   = xdata[i][k] - rstart;
496         start = ai[row];
497         end   = ai[row+1];
498         for (l=start; l<end; l++) {
499           val = aj[l] + cstart;
500           if (!PetscBTLookupSet(xtable,val)) {
501             if (!(ct3 < mem_estimate)) {
502               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
503               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
504               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
505               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
506               xdata[0]     = tmp;
507               mem_estimate = new_estimate; ++no_malloc;
508               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
509             }
510             xdata[i][ct2++] = val;
511             ct3++;
512           }
513         }
514         start = bi[row];
515         end   = bi[row+1];
516         for (l=start; l<end; l++) {
517           val = garray[bj[l]];
518           if (!PetscBTLookupSet(xtable,val)) {
519             if (!(ct3 < mem_estimate)) {
520               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
521               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
522               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
523               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
524               xdata[0]     = tmp;
525               mem_estimate = new_estimate; ++no_malloc;
526               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
527             }
528             xdata[i][ct2++] = val;
529             ct3++;
530           }
531         }
532       }
533       /* Update the header*/
534       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
535       xdata[i][2*j-1] = rbuf_i[2*j-1];
536     }
537     xdata[i][0] = rbuf_0;
538     xdata[i+1]  = xdata[i] + ct2;
539     isz1[i]     = ct2; /* size of each message */
540   }
541   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
542   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 /* -------------------------------------------------------------------------*/
546 EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
547 EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
548 /*
549     Every processor gets the entire matrix
550 */
551 #undef __FUNCT__
552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
553 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
554 {
555   Mat            B;
556   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
557   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
558   PetscErrorCode ierr;
559   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
560   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
561   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
562   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
563 
564   PetscFunctionBegin;
565   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
566   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
567 
568   if (scall == MAT_INITIAL_MATRIX) {
569     /* ----------------------------------------------------------------
570          Tell every processor the number of nonzeros per row
571     */
572     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
573     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
574       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
575     }
576     sendcount = A->rmap->rend - A->rmap->rstart;
577     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
578     for (i=0; i<size; i++) {
579       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
580       displs[i]     = A->rmap->range[i];
581     }
582 #if defined(PETSC_HAVE_MPI_IN_PLACE)
583     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
584 #else
585     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
586 #endif
587     /* ---------------------------------------------------------------
588          Create the sequential matrix of the same type as the local block diagonal
589     */
590     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
591     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
592     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
593     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
594     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
595     **Bin = B;
596     b = (Mat_SeqAIJ *)B->data;
597 
598     /*--------------------------------------------------------------------
599        Copy my part of matrix column indices over
600     */
601     sendcount  = ad->nz + bd->nz;
602     jsendbuf   = b->j + b->i[rstarts[rank]];
603     a_jsendbuf = ad->j;
604     b_jsendbuf = bd->j;
605     n          = A->rmap->rend - A->rmap->rstart;
606     cnt        = 0;
607     for (i=0; i<n; i++) {
608 
609       /* put in lower diagonal portion */
610       m = bd->i[i+1] - bd->i[i];
611       while (m > 0) {
612         /* is it above diagonal (in bd (compressed) numbering) */
613         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
614         jsendbuf[cnt++] = garray[*b_jsendbuf++];
615         m--;
616       }
617 
618       /* put in diagonal portion */
619       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
620         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
621       }
622 
623       /* put in upper diagonal portion */
624       while (m-- > 0) {
625         jsendbuf[cnt++] = garray[*b_jsendbuf++];
626       }
627     }
628     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
629 
630     /*--------------------------------------------------------------------
631        Gather all column indices to all processors
632     */
633     for (i=0; i<size; i++) {
634       recvcounts[i] = 0;
635       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
636         recvcounts[i] += lens[j];
637       }
638     }
639     displs[0]  = 0;
640     for (i=1; i<size; i++) {
641       displs[i] = displs[i-1] + recvcounts[i-1];
642     }
643 #if defined(PETSC_HAVE_MPI_IN_PLACE)
644     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
645 #else
646     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
647 #endif
648     /*--------------------------------------------------------------------
649         Assemble the matrix into useable form (note numerical values not yet set)
650     */
651     /* set the b->ilen (length of each row) values */
652     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
653     /* set the b->i indices */
654     b->i[0] = 0;
655     for (i=1; i<=A->rmap->N; i++) {
656       b->i[i] = b->i[i-1] + lens[i-1];
657     }
658     ierr = PetscFree(lens);CHKERRQ(ierr);
659     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
660     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
661 
662   } else {
663     B  = **Bin;
664     b = (Mat_SeqAIJ *)B->data;
665   }
666 
667   /*--------------------------------------------------------------------
668        Copy my part of matrix numerical values into the values location
669   */
670   if (flag == MAT_GET_VALUES){
671     sendcount = ad->nz + bd->nz;
672     sendbuf   = b->a + b->i[rstarts[rank]];
673     a_sendbuf = ad->a;
674     b_sendbuf = bd->a;
675     b_sendj   = bd->j;
676     n         = A->rmap->rend - A->rmap->rstart;
677     cnt       = 0;
678     for (i=0; i<n; i++) {
679 
680       /* put in lower diagonal portion */
681       m = bd->i[i+1] - bd->i[i];
682       while (m > 0) {
683         /* is it above diagonal (in bd (compressed) numbering) */
684         if (garray[*b_sendj] > A->rmap->rstart + i) break;
685         sendbuf[cnt++] = *b_sendbuf++;
686         m--;
687         b_sendj++;
688       }
689 
690       /* put in diagonal portion */
691       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
692         sendbuf[cnt++] = *a_sendbuf++;
693       }
694 
695       /* put in upper diagonal portion */
696       while (m-- > 0) {
697         sendbuf[cnt++] = *b_sendbuf++;
698         b_sendj++;
699       }
700     }
701     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
702 
703     /* -----------------------------------------------------------------
704        Gather all numerical values to all processors
705     */
706     if (!recvcounts) {
707       ierr   = PetscMalloc2(size,PetscInt,&recvcounts,size,PetscInt,&displs);CHKERRQ(ierr);
708     }
709     for (i=0; i<size; i++) {
710       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
711     }
712     displs[0]  = 0;
713     for (i=1; i<size; i++) {
714       displs[i] = displs[i-1] + recvcounts[i-1];
715     }
716     recvbuf   = b->a;
717 #if defined(PETSC_HAVE_MPI_IN_PLACE)
718     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
719 #else
720     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
721 #endif
722   }  /* endof (flag == MAT_GET_VALUES) */
723   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
724 
725   if (A->symmetric){
726     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
727   } else if (A->hermitian) {
728     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
729   } else if (A->structurally_symmetric) {
730     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
731   }
732   PetscFunctionReturn(0);
733 }
734 
735 #undef __FUNCT__
736 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
737 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
738 {
739   PetscErrorCode ierr;
740   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
741   PetscTruth     rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
742 
743   PetscFunctionBegin;
744   /*
745        Check for special case each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754       ierr = PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
755     }
756   }
757   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
758   if (twantallmatrix) {
759     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
760     PetscFunctionReturn(0);
761   }
762 
763   /* Allocate memory to hold all the submatrices */
764   if (scall != MAT_REUSE_MATRIX) {
765     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
766   }
767   /* Determine the number of stages through which submatrices are done */
768   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
769   if (!nmax) nmax = 1;
770   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
771 
772   /* Make sure every processor loops through the nstages */
773   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
774 
775   for (i=0,pos=0; i<nstages; i++) {
776     if (pos+nmax <= ismax) max_no = nmax;
777     else if (pos == ismax) max_no = 0;
778     else                   max_no = ismax-pos;
779     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
780     pos += max_no;
781   }
782   PetscFunctionReturn(0);
783 }
784 
785 /* -------------------------------------------------------------------------*/
786 #undef __FUNCT__
787 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
    MatGetSubMatrices_MPIAIJ_Local - Extracts one "stage" of up to ismax sequential
    submatrices from the parallel MPIAIJ matrix C, one per (isrow[i], iscol[i]) pair.

    Collective on C.  Rows owned by other processes are fetched through a four-phase
    nonblocking exchange:
      tag0: row-index requests (header + row numbers, see message format above)
      tag1: replies carrying the per-row nonzero counts (so receivers can size buffers)
      tag2: the column indices of the requested rows
      tag3: the numerical values of the requested rows
    Locally owned rows are copied directly via MatGetRow_MPIAIJ, overlapping with the
    communication.

    With scall == MAT_REUSE_MATRIX the previously returned submats[] must have the
    same sizes and nonzero counts as the freshly computed ones (this is checked);
    otherwise new SeqAIJ matrices are created and preallocated from lens[].
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscTruth     sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted (the checks themselves are currently
       disabled; only the ISSorted() calls remain) */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  /* irow/icol hold the raw index arrays of each IS; nrow/ncol their local lengths */
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total number of rows this process will request from proc p (data length),
     w2[p] = 1 iff any message goes to p, w3[p] = number of index sets that involve p
     (each contributes a 2-entry header), w4 = per-IS scratch counter */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear search of the ownership ranges to find which proc owns global row */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists the destination ranks, so later loops run over actual targets only */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz entries back the send buffers sbuf1[],
     the second msz entries are reused below as the receive buffers rbuf2[] */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve tmp into per-destination send buffers of length w1[j] each */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    /* ptr[j] is the running write cursor for the data (row indices) region */
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];          /* bump the IS count in the header */
        sbuf1_j[2*k]   = ctr_j;                 /* number of rows for this IS */
        sbuf1_j[2*k-1] = i;                     /* which IS these rows belong to */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  /* the size replies mirror the requests in shape, so the tail half of tmp
     (starting at tmp+msz) is large enough to hold all of them */
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    /* For each incoming request: compute, for every requested row, its total
       nonzero count (diagonal A part + off-diagonal B part) and reply with a
       message of the same layout whose data entries are those counts. */
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;        /* data begins after the header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;  /* global row -> local row */
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;                 /* total nnz to ship back later */
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];             /* slot 0 now carries the grand total */
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  /* As each size reply arrives, allocate exactly rbuf2[idex][0] entries (the
     sender's grand total) and post the receives for column indices and values */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation, sliced per request by req_size[] */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;    /* cursor into the requested-row list */
      ct2       = 0;                   /* cursor into the outgoing column buffer */
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global column indices in three runs: B entries left of the
             diagonal block, then the A (diagonal) block, then B entries right
             of it — presumably this keeps each row's columns ascending, since
             garray maps local B columns to sorted global indices (NOTE(review):
             relies on garray being sorted; confirm against MPIAIJ invariants) */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    /* pack the numerical values in exactly the same three-run order as the
       column indices above, so rbuf3/rbuf4 entries correspond one-to-one */
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: cmap[i][gcol] = (position of gcol in iscol[i]) + 1,
     and 0 when the column is not selected — the +1 lets 0 mean "absent" */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of row j of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: rmap[i][grow] = row number of grow in submatrix i (0-based) */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* replay each send header (sbuf1) against the matching count (rbuf2) and
     column (rbuf3) replies; pa[idex2] recovers the destination rank because
     MPI_Waitany hands back the request index, not the rank */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscTruth flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* mat->ilen doubles as the per-row insertion cursor while filling mat->j/mat->a */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;      /* undo the +1 offset of cmap */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* same header-replay as the lens update above, this time writing j/a entries */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1390 
1391