xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 61d68fdc9e13fb35ae33ecaba251e21045a804dc)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (no of index sets in the mesg)
37   -----------
38   mesg [1] = 1 => is[1]
39   mesg [2] = sizeof(is[1]);
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = sizeof(is[5]);
43   -----------
44   mesg [5]
45   mesg [n]  datas[1]
46   -----------
47   mesg[n+1]
48   mesg[m]  data(is[5])
49   -----------
50 
  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
55 #undef __FUNCT__
56 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
57 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58 {
59   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
60   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61   const PetscInt **idx,*idx_i;
62   PetscInt       *n,*rtable,**data,len;
63   PetscErrorCode ierr;
64   PetscMPIInt    size,rank,tag1,tag2;
65   PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67   PetscBT        *table;
68   MPI_Comm       comm;
69   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70   MPI_Status     *s_status,*recv_status;
71 
72   PetscFunctionBegin;
73   comm   = ((PetscObject)C)->comm;
74   size   = c->size;
75   rank   = c->rank;
76   m      = C->rmap->N;
77 
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
79   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
80 
81   ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);
82 
83   for (i=0; i<imax; i++) {
84     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
85     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
86   }
87 
88   /* Create hash table for the mapping :row -> proc*/
89   for (i=0,j=0; i<size; i++) {
90     len = C->rmap->range[i+1];
91     for (; j<len; j++) {
92       rtable[j] = i;
93     }
94   }
95 
96   /* evaluate communication - mesg to who,length of mesg, and buffer space
97      required. Based on this, buffers are allocated, and data copied into them*/
98   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
99   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
100   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
101   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
102   for (i=0; i<imax; i++) {
103     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
104     idx_i = idx[i];
105     len   = n[i];
106     for (j=0; j<len; j++) {
107       row  = idx_i[j];
108       if (row < 0) {
109         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
110       }
111       proc = rtable[row];
112       w4[proc]++;
113     }
114     for (j=0; j<size; j++){
115       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
116     }
117   }
118 
119   nrqs     = 0;              /* no of outgoing messages */
120   msz      = 0;              /* total mesg length (for all proc */
121   w1[rank] = 0;              /* no mesg sent to intself */
122   w3[rank] = 0;
123   for (i=0; i<size; i++) {
124     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
125   }
126   /* pa - is list of processors to communicate with */
127   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
128   for (i=0,j=0; i<size; i++) {
129     if (w1[i]) {pa[j] = i; j++;}
130   }
131 
132   /* Each message would have a header = 1 + 2*(no of IS) + data */
133   for (i=0; i<nrqs; i++) {
134     j      = pa[i];
135     w1[j] += w2[j] + 2*w3[j];
136     msz   += w1[j];
137   }
138 
139   /* Determine the number of messages to expect, their lengths, from from-ids */
140   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
141   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
142 
143   /* Now post the Irecvs corresponding to these messages */
144   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
145 
146   /* Allocate Memory for outgoing messages */
147   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
148   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
149   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
150 
151   {
152     PetscInt *iptr = tmp,ict  = 0;
153     for (i=0; i<nrqs; i++) {
154       j         = pa[i];
155       iptr     +=  ict;
156       outdat[j] = iptr;
157       ict       = w1[j];
158     }
159   }
160 
161   /* Form the outgoing messages */
162   /*plug in the headers*/
163   for (i=0; i<nrqs; i++) {
164     j            = pa[i];
165     outdat[j][0] = 0;
166     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
167     ptr[j]       = outdat[j] + 2*w3[j] + 1;
168   }
169 
170   /* Memory for doing local proc's work*/
171   {
172     PetscInt  *d_p;
173     char      *t_p;
174 
175     /* should replace with PetscMallocN() */
176     ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
177       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
178     ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
179       (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
180     data  = (PetscInt **)(table + imax);
181     isz   = (PetscInt  *)(data  + imax);
182     d_p   = (PetscInt  *)(isz   + imax);
183     t_p   = (char *)(d_p   + m*imax);
184     for (i=0; i<imax; i++) {
185       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186       data[i]  = d_p + (m)*i;
187     }
188   }
189 
190   /* Parse the IS and update local tables and the outgoing buf with the data*/
191   {
192     PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
193     PetscBT table_i;
194 
195     for (i=0; i<imax; i++) {
196       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
197       n_i     = n[i];
198       table_i = table[i];
199       idx_i   = idx[i];
200       data_i  = data[i];
201       isz_i   = isz[i];
202       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
203         row  = idx_i[j];
204         proc = rtable[row];
205         if (proc != rank) { /* copy to the outgoing buffer */
206           ctr[proc]++;
207           *ptr[proc] = row;
208           ptr[proc]++;
209         } else { /* Update the local table */
210           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211         }
212       }
213       /* Update the headers for the current IS */
214       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215         if ((ctr_j = ctr[j])) {
216           outdat_j        = outdat[j];
217           k               = ++outdat_j[0];
218           outdat_j[2*k]   = ctr_j;
219           outdat_j[2*k-1] = i;
220         }
221       }
222       isz[i] = isz_i;
223     }
224   }
225 
226   /*  Now  post the sends */
227   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
228   for (i=0; i<nrqs; ++i) {
229     j    = pa[i];
230     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
231   }
232 
233   /* No longer need the original indices*/
234   for (i=0; i<imax; ++i) {
235     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
236   }
237   ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);
238 
239   for (i=0; i<imax; ++i) {
240     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
241   }
242 
243   /* Do Local work*/
244   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
245 
246   /* Receive messages*/
247   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
248   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
249 
250   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
251   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
252 
253   /* Phase 1 sends are complete - deallocate buffers */
254   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
255   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
256 
257   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
258   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
259   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
260   ierr = PetscFree(rbuf);CHKERRQ(ierr);
261 
262 
263  /* Send the data back*/
264   /* Do a global reduction to know the buffer space req for incoming messages*/
265   {
266     PetscMPIInt *rw1;
267 
268     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
269     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
270 
271     for (i=0; i<nrqr; ++i) {
272       proc      = recv_status[i].MPI_SOURCE;
273       if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
274       rw1[proc] = isz1[i];
275     }
276     ierr = PetscFree(onodes1);CHKERRQ(ierr);
277     ierr = PetscFree(olengths1);CHKERRQ(ierr);
278 
279     /* Determine the number of messages to expect, their lengths, from from-ids */
280     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
281     ierr = PetscFree(rw1);CHKERRQ(ierr);
282   }
283   /* Now post the Irecvs corresponding to these messages */
284   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
285 
286   /*  Now  post the sends */
287   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
288   for (i=0; i<nrqr; ++i) {
289     j    = recv_status[i].MPI_SOURCE;
290     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
291   }
292 
293   /* receive work done on other processors*/
294   {
295     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
296     PetscMPIInt idex;
297     PetscBT     table_i;
298     MPI_Status  *status2;
299 
300     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
301     for (i=0; i<nrqs; ++i) {
302       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
303       /* Process the message*/
304       rbuf2_i = rbuf2[idex];
305       ct1     = 2*rbuf2_i[0]+1;
306       jmax    = rbuf2[idex][0];
307       for (j=1; j<=jmax; j++) {
308         max     = rbuf2_i[2*j];
309         is_no   = rbuf2_i[2*j-1];
310         isz_i   = isz[is_no];
311         data_i  = data[is_no];
312         table_i = table[is_no];
313         for (k=0; k<max; k++,ct1++) {
314           row = rbuf2_i[ct1];
315           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
316         }
317         isz[is_no] = isz_i;
318       }
319     }
320 
321     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
322     ierr = PetscFree(status2);CHKERRQ(ierr);
323   }
324 
325   for (i=0; i<imax; ++i) {
326     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
327   }
328 
329   ierr = PetscFree(onodes2);CHKERRQ(ierr);
330   ierr = PetscFree(olengths2);CHKERRQ(ierr);
331 
332   ierr = PetscFree(pa);CHKERRQ(ierr);
333   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
334   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
335   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
337   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(table);CHKERRQ(ierr);
339   ierr = PetscFree(s_status);CHKERRQ(ierr);
340   ierr = PetscFree(recv_status);CHKERRQ(ierr);
341   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
342   ierr = PetscFree(xdata);CHKERRQ(ierr);
343   ierr = PetscFree(isz1);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
349 /*
350    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
351        the work on the local processor.
352 
353      Inputs:
354       C      - MAT_MPIAIJ;
355       imax - total no of index sets processed at a time;
356       table  - an array of char - size = m bits.
357 
358      Output:
359       isz    - array containing the count of the solution elements correspondign
360                to each index set;
361       data   - pointer to the solutions
362 */
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
369   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = C->rmap->rstart;
374   cstart = C->cmap->rstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
408 /*
409       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
410          and return the output
411 
412          Input:
413            C    - the matrix
414            nrqr - no of messages being processed.
415            rbuf - an array of pointers to the recieved requests
416 
417          Output:
418            xdata - array of messages to be sent back
419            isz1  - size of each message
420 
421   For better efficiency perhaps we should malloc separately each xdata[i],
422 then if a remalloc is required we need only copy the data for that one row
423 rather then all previous rows as it is now where a single large chunck of
424 memory is used.
425 
426 */
427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
428 {
429   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
430   Mat            A = c->A,B = c->B;
431   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   PetscErrorCode ierr;
433   PetscMPIInt    rank;
434   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
435   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
436   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
437   PetscInt       *rbuf_i,kmax,rbuf_0;
438   PetscBT        xtable;
439 
440   PetscFunctionBegin;
441   rank   = c->rank;
442   m      = C->rmap->N;
443   rstart = C->rmap->rstart;
444   cstart = C->cmap->rstart;
445   ai     = a->i;
446   aj     = a->j;
447   bi     = b->i;
448   bj     = b->j;
449   garray = c->garray;
450 
451 
452   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
453     rbuf_i  =  rbuf[i];
454     rbuf_0  =  rbuf_i[0];
455     ct     += rbuf_0;
456     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
457   }
458 
459   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
460   else      max1 = 1;
461   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
462   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
463   ++no_malloc;
464   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
465   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
466 
467   ct3 = 0;
468   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
469     rbuf_i =  rbuf[i];
470     rbuf_0 =  rbuf_i[0];
471     ct1    =  2*rbuf_0+1;
472     ct2    =  ct1;
473     ct3    += ct1;
474     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
475       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
476       oct2 = ct2;
477       kmax = rbuf_i[2*j];
478       for (k=0; k<kmax; k++,ct1++) {
479         row = rbuf_i[ct1];
480         if (!PetscBTLookupSet(xtable,row)) {
481           if (!(ct3 < mem_estimate)) {
482             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
484             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
485             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
486             xdata[0]     = tmp;
487             mem_estimate = new_estimate; ++no_malloc;
488             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
489           }
490           xdata[i][ct2++] = row;
491           ct3++;
492         }
493       }
494       for (k=oct2,max2=ct2; k<max2; k++) {
495         row   = xdata[i][k] - rstart;
496         start = ai[row];
497         end   = ai[row+1];
498         for (l=start; l<end; l++) {
499           val = aj[l] + cstart;
500           if (!PetscBTLookupSet(xtable,val)) {
501             if (!(ct3 < mem_estimate)) {
502               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
503               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
504               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
505               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
506               xdata[0]     = tmp;
507               mem_estimate = new_estimate; ++no_malloc;
508               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
509             }
510             xdata[i][ct2++] = val;
511             ct3++;
512           }
513         }
514         start = bi[row];
515         end   = bi[row+1];
516         for (l=start; l<end; l++) {
517           val = garray[bj[l]];
518           if (!PetscBTLookupSet(xtable,val)) {
519             if (!(ct3 < mem_estimate)) {
520               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
521               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
522               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
523               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
524               xdata[0]     = tmp;
525               mem_estimate = new_estimate; ++no_malloc;
526               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
527             }
528             xdata[i][ct2++] = val;
529             ct3++;
530           }
531         }
532       }
533       /* Update the header*/
534       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
535       xdata[i][2*j-1] = rbuf_i[2*j-1];
536     }
537     xdata[i][0] = rbuf_0;
538     xdata[i+1]  = xdata[i] + ct2;
539     isz1[i]     = ct2; /* size of each message */
540   }
541   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
542   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 /* -------------------------------------------------------------------------*/
546 EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
547 EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
548 /*
549     Every processor gets the entire matrix
550 */
551 #undef __FUNCT__
552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
553 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
554 {
555   Mat            B;
556   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
557   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
558   PetscErrorCode ierr;
559   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
560   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
561   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
562   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
563 
564   PetscFunctionBegin;
565   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
566   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
567 
568   if (scall == MAT_INITIAL_MATRIX) {
569     /* ----------------------------------------------------------------
570          Tell every processor the number of nonzeros per row
571     */
572     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
573     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
574       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
575     }
576     sendcount = A->rmap->rend - A->rmap->rstart;
577     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
578     for (i=0; i<size; i++) {
579       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
580       displs[i]     = A->rmap->range[i];
581     }
582 #if defined(PETSC_HAVE_MPI_IN_PLACE)
583     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
584 #else
585     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
586 #endif
587     /* ---------------------------------------------------------------
588          Create the sequential matrix of the same type as the local block diagonal
589     */
590     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
591     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
592     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
593     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
594     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
595     **Bin = B;
596     b = (Mat_SeqAIJ *)B->data;
597 
598     /*--------------------------------------------------------------------
599        Copy my part of matrix column indices over
600     */
601     sendcount  = ad->nz + bd->nz;
602     jsendbuf   = b->j + b->i[rstarts[rank]];
603     a_jsendbuf = ad->j;
604     b_jsendbuf = bd->j;
605     n          = A->rmap->rend - A->rmap->rstart;
606     cnt        = 0;
607     for (i=0; i<n; i++) {
608 
609       /* put in lower diagonal portion */
610       m = bd->i[i+1] - bd->i[i];
611       while (m > 0) {
612         /* is it above diagonal (in bd (compressed) numbering) */
613         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
614         jsendbuf[cnt++] = garray[*b_jsendbuf++];
615         m--;
616       }
617 
618       /* put in diagonal portion */
619       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
620         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
621       }
622 
623       /* put in upper diagonal portion */
624       while (m-- > 0) {
625         jsendbuf[cnt++] = garray[*b_jsendbuf++];
626       }
627     }
628     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
629 
630     /*--------------------------------------------------------------------
631        Gather all column indices to all processors
632     */
633     for (i=0; i<size; i++) {
634       recvcounts[i] = 0;
635       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
636         recvcounts[i] += lens[j];
637       }
638     }
639     displs[0]  = 0;
640     for (i=1; i<size; i++) {
641       displs[i] = displs[i-1] + recvcounts[i-1];
642     }
643 #if defined(PETSC_HAVE_MPI_IN_PLACE)
644     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
645 #else
646     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
647 #endif
648     /*--------------------------------------------------------------------
649         Assemble the matrix into useable form (note numerical values not yet set)
650     */
651     /* set the b->ilen (length of each row) values */
652     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
653     /* set the b->i indices */
654     b->i[0] = 0;
655     for (i=1; i<=A->rmap->N; i++) {
656       b->i[i] = b->i[i-1] + lens[i-1];
657     }
658     ierr = PetscFree(lens);CHKERRQ(ierr);
659     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
660     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
661 
662   } else {
663     B  = **Bin;
664     b = (Mat_SeqAIJ *)B->data;
665   }
666 
667   /*--------------------------------------------------------------------
668        Copy my part of matrix numerical values into the values location
669   */
670   if (flag == MAT_GET_VALUES){
671     sendcount = ad->nz + bd->nz;
672     sendbuf   = b->a + b->i[rstarts[rank]];
673     a_sendbuf = ad->a;
674     b_sendbuf = bd->a;
675     b_sendj   = bd->j;
676     n         = A->rmap->rend - A->rmap->rstart;
677     cnt       = 0;
678     for (i=0; i<n; i++) {
679 
680       /* put in lower diagonal portion */
681       m = bd->i[i+1] - bd->i[i];
682       while (m > 0) {
683         /* is it above diagonal (in bd (compressed) numbering) */
684         if (garray[*b_sendj] > A->rmap->rstart + i) break;
685         sendbuf[cnt++] = *b_sendbuf++;
686         m--;
687         b_sendj++;
688       }
689 
690       /* put in diagonal portion */
691       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
692         sendbuf[cnt++] = *a_sendbuf++;
693       }
694 
695       /* put in upper diagonal portion */
696       while (m-- > 0) {
697         sendbuf[cnt++] = *b_sendbuf++;
698         b_sendj++;
699       }
700     }
701     if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
702 
703     /* -----------------------------------------------------------------
704        Gather all numerical values to all processors
705     */
706     if (!recvcounts) {
707       ierr   = PetscMalloc2(size,PetscInt,&recvcounts,size,PetscInt,&displs);CHKERRQ(ierr);
708     }
709     for (i=0; i<size; i++) {
710       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
711     }
712     displs[0]  = 0;
713     for (i=1; i<size; i++) {
714       displs[i] = displs[i-1] + recvcounts[i-1];
715     }
716     recvbuf   = b->a;
717 #if defined(PETSC_HAVE_MPI_IN_PLACE)
718     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
719 #else
720     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
721 #endif
722   }  /* endof (flag == MAT_GET_VALUES) */
723   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
724 
725   if (A->symmetric){
726     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
727   } else if (A->hermitian) {
728     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
729   } else if (A->structurally_symmetric) {
730     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
731   }
732   PetscFunctionReturn(0);
733 }
734 
735 #undef __FUNCT__
736 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
737 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
738 {
739   PetscErrorCode ierr;
740   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
741   PetscTruth     rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
742 
743   PetscFunctionBegin;
744   /*
745        Check for special case each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754       ierr = PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
755     }
756   }
757   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
758   if (twantallmatrix) {
759     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
760     PetscFunctionReturn(0);
761   }
762 
763   /* Allocate memory to hold all the submatrices */
764   if (scall != MAT_REUSE_MATRIX) {
765     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
766   }
767   /* Determine the number of stages through which submatrices are done */
768   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
769   if (!nmax) nmax = 1;
770   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
771 
772   /* Make sure every processor loops through the nstages */
773   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
774 
775   for (i=0,pos=0; i<nstages; i++) {
776     if (pos+nmax <= ismax) max_no = nmax;
777     else if (pos == ismax) max_no = 0;
778     else                   max_no = ismax-pos;
779     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
780     pos += max_no;
781   }
782   PetscFunctionReturn(0);
783 }
784 /* -------------------------------------------------------------------------*/
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts one stage (up to ismax index-set
   pairs) of sequential submatrices from the parallel MPIAIJ matrix C.

   Rows of C owned by this process are copied into the submatrices directly
   (via MatGetRow_MPIAIJ); rows owned by other processes are obtained through
   four rounds of nonblocking point-to-point communication:

     round 1 (tag0): row requests, one message per destination process,
                     using a header format analogous to the one documented
                     above MatIncreaseOverlap_MPIAIJ_Once  (sbuf1 -> rbuf1)
     round 2 (tag1): per-requested-row nonzero counts                (sbuf2 -> rbuf2)
     round 3 (tag2): column indices of the requested rows            (sbuf_aj -> rbuf3)
     round 4 (tag3): numerical values of the requested rows          (sbuf_aa -> rbuf4)

   With scall == MAT_REUSE_MATRIX the previously created submats[] are refilled
   (their nonzero structure must be unchanged); otherwise new SeqAIJ matrices
   are created in submats[].

   NOTE(review): cmap/rmap are dense arrays of length C->cmap->N / C->rmap->N
   per index set, so memory use scales with the global matrix size times ismax
   (this is why the caller limits ismax per stage).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscTruth     sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted; the sortedness checks below are
       currently disabled (SETERRQ calls commented out), so the calls only
       exercise ISSorted and the result is unused */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,PetscInt*,&irow,ismax,PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* cache the indices and sizes of every row/column index set */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total rows requested from proc p, w3[p] = number of index sets
     that request at least one row from p, w4 = per-IS scratch counts */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* find the process that owns global row irow_i[j] by a linear scan of
         the ownership ranges */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; note that tmp is sized 2*msz so
     its tail can later serve as the receive space rbuf2 (see below) */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* hand each destination process its contiguous slice of the tmp workspace */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 reuses the second half of tmp (tmp was allocated with 2*msz
     entries); each reply has the same length as the request it answers */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For each incoming row request: count the nonzeros of every requested row
     (diagonal part A plus off-diagonal part B) and reply with the per-row
     counts; idex is the completion index delivered by MPI_Waitany, so the
     processing order follows message arrival, not rank order */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages; rbuf2[idex][0] holds the total data length the source
     will send, so the column-index (rbuf3) and value (rbuf4) receives can be
     posted with exactly the right size */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation carved into per-request slices of req_size[i] */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global column indices in three groups: off-diagonal entries
             left of the diagonal block (< cstart), the diagonal block, then
             off-diagonal entries right of it (>= cend) */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* values are packed in exactly the same three-group order as the column
     indices above, so rbuf3/rbuf4 entries correspond position-by-position */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: cmap[i][global col] = (position in iscol[i]) + 1,
     0 meaning the column is not selected by index set i */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = number of nonzeros in row j of submatrix i (preallocation) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: rmap[i][global row] = local row index in submatrix i */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* walk the replies in arrival order; idex2 is the request slot that
     completed, idex = pa[idex2] is the corresponding source rank */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscTruth flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            /* cmap_i[] stores position+1; 0 means column not in iscol[i] */
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1389 
1390