xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 72ce74bd30d5d09d1371a3eb387704f4af395d1a)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include "src/mat/impls/aij/mpi/mpiaij.h"
6 #include "petscbt.h"
7 
8 static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
9 static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
10 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
11 EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
12 EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13 
14 #undef __FUNCT__
15 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
16 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS is[],int ov)
17 {
18   int i,ierr;
19 
20   PetscFunctionBegin;
21   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
22   for (i=0; i<ov; ++i) {
23     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
24   }
25   PetscFunctionReturn(0);
26 }
27 
28 /*
29   Sample message format:
30   If a processor A wants processor B to process some elements corresponding
31   to index sets is[1],is[5]
32   mesg [0] = 2   (no of index sets in the mesg)
33   -----------
34   mesg [1] = 1 => is[1]
35   mesg [2] = sizeof(is[1]);
36   -----------
37   mesg [3] = 5  => is[5]
38   mesg [4] = sizeof(is[5]);
39   -----------
40   mesg [5]
41   mesg [n]  datas[1]
42   -----------
43   mesg[n+1]
44   mesg[m]  data(is[5])
45   -----------
46 
47   Notes:
48   nrqs - no of requests sent (or to be sent out)
49   nrqr - no of requests recieved (which have to be or which have been processed
50 */
51 #undef __FUNCT__
52 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
53 static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS is[])
54 {
55   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
56   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
57   int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
58   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
59   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
60   PetscBT     *table;
61   MPI_Comm    comm;
62   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
63   MPI_Status  *s_status,*recv_status;
64 
65   PetscFunctionBegin;
66   comm   = C->comm;
67   size   = c->size;
68   rank   = c->rank;
69   m      = C->M;
70 
71   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
72   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
73 
74   len    = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
75   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
76   n      = (int*)(idx + imax);
77   rtable = n + imax;
78 
79   for (i=0; i<imax; i++) {
80     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
81     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
82   }
83 
84   /* Create hash table for the mapping :row -> proc*/
85   for (i=0,j=0; i<size; i++) {
86     len = c->rowners[i+1];
87     for (; j<len; j++) {
88       rtable[j] = i;
89     }
90   }
91 
92   /* evaluate communication - mesg to who,length of mesg, and buffer space
93      required. Based on this, buffers are allocated, and data copied into them*/
94   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
95   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
96   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
97   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
98   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
99   for (i=0; i<imax; i++) {
100     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
101     idx_i = idx[i];
102     len   = n[i];
103     for (j=0; j<len; j++) {
104       row  = idx_i[j];
105       if (row < 0) {
106         SETERRQ(1,"Index set cannot have negative entries");
107       }
108       proc = rtable[row];
109       w4[proc]++;
110     }
111     for (j=0; j<size; j++){
112       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
113     }
114   }
115 
116   nrqs     = 0;              /* no of outgoing messages */
117   msz      = 0;              /* total mesg length (for all proc */
118   w1[rank] = 0;              /* no mesg sent to intself */
119   w3[rank] = 0;
120   for (i=0; i<size; i++) {
121     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
122   }
123   /* pa - is list of processors to communicate with */
124   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
125   for (i=0,j=0; i<size; i++) {
126     if (w1[i]) {pa[j] = i; j++;}
127   }
128 
129   /* Each message would have a header = 1 + 2*(no of IS) + data */
130   for (i=0; i<nrqs; i++) {
131     j      = pa[i];
132     w1[j] += w2[j] + 2*w3[j];
133     msz   += w1[j];
134   }
135 
136   /* Determine the number of messages to expect, their lengths, from from-ids */
137   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
138   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
139 
140   /* Now post the Irecvs corresponding to these messages */
141   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
142 
143   /* Allocate Memory for outgoing messages */
144   len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
145   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
146   ptr  = outdat + size;     /* Pointers to the data in outgoing buffers */
147   ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
148   tmp  = (int*)(outdat + 2*size);
149   ctr  = tmp + msz;
150 
151   {
152     int *iptr = tmp,ict  = 0;
153     for (i=0; i<nrqs; i++) {
154       j         = pa[i];
155       iptr     +=  ict;
156       outdat[j] = iptr;
157       ict       = w1[j];
158     }
159   }
160 
161   /* Form the outgoing messages */
162   /*plug in the headers*/
163   for (i=0; i<nrqs; i++) {
164     j            = pa[i];
165     outdat[j][0] = 0;
166     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
167     ptr[j]       = outdat[j] + 2*w3[j] + 1;
168   }
169 
170   /* Memory for doing local proc's work*/
171   {
172     int  *d_p;
173     char *t_p;
174 
175     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
176       (m)*imax*sizeof(int)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
177     ierr  = PetscMalloc(len,&table);CHKERRQ(ierr);
178     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
179     data  = (int **)(table + imax);
180     isz   = (int  *)(data  + imax);
181     d_p   = (int  *)(isz   + imax);
182     t_p   = (char *)(d_p   + m*imax);
183     for (i=0; i<imax; i++) {
184       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
185       data[i]  = d_p + (m)*i;
186     }
187   }
188 
189   /* Parse the IS and update local tables and the outgoing buf with the data*/
190   {
191     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
192     PetscBT table_i;
193 
194     for (i=0; i<imax; i++) {
195       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
196       n_i     = n[i];
197       table_i = table[i];
198       idx_i   = idx[i];
199       data_i  = data[i];
200       isz_i   = isz[i];
201       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
202         row  = idx_i[j];
203         proc = rtable[row];
204         if (proc != rank) { /* copy to the outgoing buffer */
205           ctr[proc]++;
206           *ptr[proc] = row;
207           ptr[proc]++;
208         } else { /* Update the local table */
209           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
210         }
211       }
212       /* Update the headers for the current IS */
213       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
214         if ((ctr_j = ctr[j])) {
215           outdat_j        = outdat[j];
216           k               = ++outdat_j[0];
217           outdat_j[2*k]   = ctr_j;
218           outdat_j[2*k-1] = i;
219         }
220       }
221       isz[i] = isz_i;
222     }
223   }
224 
225 
226 
227   /*  Now  post the sends */
228   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
229   for (i=0; i<nrqs; ++i) {
230     j    = pa[i];
231     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
232   }
233 
234   /* No longer need the original indices*/
235   for (i=0; i<imax; ++i) {
236     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
237   }
238   ierr = PetscFree(idx);CHKERRQ(ierr);
239 
240   for (i=0; i<imax; ++i) {
241     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
242   }
243 
244   /* Do Local work*/
245   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
246 
247   /* Receive messages*/
248   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
249   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
250 
251   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
252   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
253 
254   /* Phase 1 sends are complete - deallocate buffers */
255   ierr = PetscFree(outdat);CHKERRQ(ierr);
256   ierr = PetscFree(w1);CHKERRQ(ierr);
257 
258   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&xdata);CHKERRQ(ierr);
259   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
260   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
261   ierr = PetscFree(rbuf);CHKERRQ(ierr);
262 
263 
264  /* Send the data back*/
265   /* Do a global reduction to know the buffer space req for incoming messages*/
266   {
267     int *rw1;
268 
269     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
270     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
271 
272     for (i=0; i<nrqr; ++i) {
273       proc      = recv_status[i].MPI_SOURCE;
274       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
275       rw1[proc] = isz1[i];
276     }
277     ierr = PetscFree(onodes1);CHKERRQ(ierr);
278     ierr = PetscFree(olengths1);CHKERRQ(ierr);
279 
280     /* Determine the number of messages to expect, their lengths, from from-ids */
281     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
282     PetscFree(rw1);
283   }
284   /* Now post the Irecvs corresponding to these messages */
285   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
286 
287   /*  Now  post the sends */
288   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
289   for (i=0; i<nrqr; ++i) {
290     j    = recv_status[i].MPI_SOURCE;
291     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
292   }
293 
294   /* receive work done on other processors*/
295   {
296     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
297     PetscBT     table_i;
298     MPI_Status  *status2;
299 
300     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
301     for (i=0; i<nrqs; ++i) {
302       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
303       /* Process the message*/
304       rbuf2_i = rbuf2[idex];
305       ct1     = 2*rbuf2_i[0]+1;
306       jmax    = rbuf2[idex][0];
307       for (j=1; j<=jmax; j++) {
308         max     = rbuf2_i[2*j];
309         is_no   = rbuf2_i[2*j-1];
310         isz_i   = isz[is_no];
311         data_i  = data[is_no];
312         table_i = table[is_no];
313         for (k=0; k<max; k++,ct1++) {
314           row = rbuf2_i[ct1];
315           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
316         }
317         isz[is_no] = isz_i;
318       }
319     }
320 
321     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
322     ierr = PetscFree(status2);CHKERRQ(ierr);
323   }
324 
325   for (i=0; i<imax; ++i) {
326     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
327   }
328 
329   ierr = PetscFree(onodes2);CHKERRQ(ierr);
330   ierr = PetscFree(olengths2);CHKERRQ(ierr);
331 
332   ierr = PetscFree(pa);CHKERRQ(ierr);
333   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
334   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
335   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
337   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(table);CHKERRQ(ierr);
339   ierr = PetscFree(s_status);CHKERRQ(ierr);
340   ierr = PetscFree(recv_status);CHKERRQ(ierr);
341   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
342   ierr = PetscFree(xdata);CHKERRQ(ierr);
343   ierr = PetscFree(isz1);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
349 /*
350    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
351        the work on the local processor.
352 
353      Inputs:
354       C      - MAT_MPIAIJ;
355       imax - total no of index sets processed at a time;
356       table  - an array of char - size = m bits.
357 
358      Output:
359       isz    - array containing the count of the solution elements correspondign
360                to each index set;
361       data   - pointer to the solutions
362 */
363 static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   int        start,end,val,max,rstart,cstart,*ai,*aj;
369   int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = c->rstart;
374   cstart = c->cstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
408 /*
409       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
410          and return the output
411 
412          Input:
413            C    - the matrix
414            nrqr - no of messages being processed.
415            rbuf - an array of pointers to the recieved requests
416 
417          Output:
418            xdata - array of messages to be sent back
419            isz1  - size of each message
420 
421   For better efficiency perhaps we should malloc seperately each xdata[i],
422 then if a remalloc is required we need only copy the data for that one row
423 rather then all previous rows as it is now where a single large chunck of
424 memory is used.
425 
426 */
427 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
428 {
429   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
430   Mat         A = c->A,B = c->B;
431   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
433   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
434   int         val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
435   int         *rbuf_i,kmax,rbuf_0,ierr;
436   PetscBT     xtable;
437 
438   PetscFunctionBegin;
439   rank   = c->rank;
440   m      = C->M;
441   rstart = c->rstart;
442   cstart = c->cstart;
443   ai     = a->i;
444   aj     = a->j;
445   bi     = b->i;
446   bj     = b->j;
447   garray = c->garray;
448 
449 
450   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
451     rbuf_i  =  rbuf[i];
452     rbuf_0  =  rbuf_i[0];
453     ct     += rbuf_0;
454     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
455   }
456 
457   if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
458   else      max1 = 1;
459   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
460   ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
461   ++no_malloc;
462   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
463   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
464 
465   ct3 = 0;
466   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
467     rbuf_i =  rbuf[i];
468     rbuf_0 =  rbuf_i[0];
469     ct1    =  2*rbuf_0+1;
470     ct2    =  ct1;
471     ct3    += ct1;
472     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
473       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
474       oct2 = ct2;
475       kmax = rbuf_i[2*j];
476       for (k=0; k<kmax; k++,ct1++) {
477         row = rbuf_i[ct1];
478         if (!PetscBTLookupSet(xtable,row)) {
479           if (!(ct3 < mem_estimate)) {
480             new_estimate = (int)(1.5*mem_estimate)+1;
481             ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
482             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
483             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
484             xdata[0]     = tmp;
485             mem_estimate = new_estimate; ++no_malloc;
486             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
487           }
488           xdata[i][ct2++] = row;
489           ct3++;
490         }
491       }
492       for (k=oct2,max2=ct2; k<max2; k++) {
493         row   = xdata[i][k] - rstart;
494         start = ai[row];
495         end   = ai[row+1];
496         for (l=start; l<end; l++) {
497           val = aj[l] + cstart;
498           if (!PetscBTLookupSet(xtable,val)) {
499             if (!(ct3 < mem_estimate)) {
500               new_estimate = (int)(1.5*mem_estimate)+1;
501               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
502               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
503               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
504               xdata[0]     = tmp;
505               mem_estimate = new_estimate; ++no_malloc;
506               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
507             }
508             xdata[i][ct2++] = val;
509             ct3++;
510           }
511         }
512         start = bi[row];
513         end   = bi[row+1];
514         for (l=start; l<end; l++) {
515           val = garray[bj[l]];
516           if (!PetscBTLookupSet(xtable,val)) {
517             if (!(ct3 < mem_estimate)) {
518               new_estimate = (int)(1.5*mem_estimate)+1;
519               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
520               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
521               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
522               xdata[0]     = tmp;
523               mem_estimate = new_estimate; ++no_malloc;
524               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
525             }
526             xdata[i][ct2++] = val;
527             ct3++;
528           }
529         }
530       }
531       /* Update the header*/
532       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
533       xdata[i][2*j-1] = rbuf_i[2*j-1];
534     }
535     xdata[i][0] = rbuf_0;
536     xdata[i+1]  = xdata[i] + ct2;
537     isz1[i]     = ct2; /* size of each message */
538   }
539   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
540   PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
541   PetscFunctionReturn(0);
542 }
543 /* -------------------------------------------------------------------------*/
544 EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,int,const IS[],const IS[],MatReuse,Mat*);
545 EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
546 /*
547     Every processor gets the entire matrix
548 */
549 #undef __FUNCT__
550 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
551 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
552 {
553   Mat          B;
554   Mat_MPIAIJ   *a = (Mat_MPIAIJ *)A->data;
555   Mat_SeqAIJ   *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
556   PetscErrorCode ierr;
557   int          sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
558   int          m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
559   PetscScalar  *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
560 
561   PetscFunctionBegin;
562   ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
563   ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);
564 
565   if (scall == MAT_INITIAL_MATRIX) {
566     /* ----------------------------------------------------------------
567          Tell every processor the number of nonzeros per row
568     */
569     ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
570     for (i=a->rstart; i<a->rend; i++) {
571       lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
572     }
573     sendcount = a->rend - a->rstart;
574     ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
575     displs     = recvcounts + size;
576     for (i=0; i<size; i++) {
577       recvcounts[i] = a->rowners[i+1] - a->rowners[i];
578       displs[i]     = a->rowners[i];
579     }
580     ierr  = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
581 
582     /* ---------------------------------------------------------------
583          Create the sequential matrix of the same type as the local block diagonal
584     */
585     ierr  = MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);CHKERRQ(ierr);
586     ierr  = MatSetType(B,a->A->type_name);CHKERRQ(ierr);
587     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
588     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
589     **Bin = B;
590     b = (Mat_SeqAIJ *)B->data;
591 
592     /*--------------------------------------------------------------------
593        Copy my part of matrix column indices over
594     */
595     sendcount  = ad->nz + bd->nz;
596     jsendbuf   = b->j + b->i[rstarts[rank]];
597     a_jsendbuf = ad->j;
598     b_jsendbuf = bd->j;
599     n          = a->rend - a->rstart;
600     cnt        = 0;
601     for (i=0; i<n; i++) {
602 
603       /* put in lower diagonal portion */
604       m = bd->i[i+1] - bd->i[i];
605       while (m > 0) {
606         /* is it above diagonal (in bd (compressed) numbering) */
607         if (garray[*b_jsendbuf] > a->rstart + i) break;
608         jsendbuf[cnt++] = garray[*b_jsendbuf++];
609         m--;
610       }
611 
612       /* put in diagonal portion */
613       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
614         jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
615       }
616 
617       /* put in upper diagonal portion */
618       while (m-- > 0) {
619         jsendbuf[cnt++] = garray[*b_jsendbuf++];
620       }
621     }
622     if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
623 
624     /*--------------------------------------------------------------------
625        Gather all column indices to all processors
626     */
627     for (i=0; i<size; i++) {
628       recvcounts[i] = 0;
629       for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
630         recvcounts[i] += lens[j];
631       }
632     }
633     displs[0]  = 0;
634     for (i=1; i<size; i++) {
635       displs[i] = displs[i-1] + recvcounts[i-1];
636     }
637     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
638 
639     /*--------------------------------------------------------------------
640         Assemble the matrix into useable form (note numerical values not yet set)
641     */
642     /* set the b->ilen (length of each row) values */
643     ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
644     /* set the b->i indices */
645     b->i[0] = 0;
646     for (i=1; i<=A->M; i++) {
647       b->i[i] = b->i[i-1] + lens[i-1];
648     }
649     ierr = PetscFree(lens);CHKERRQ(ierr);
650     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
651     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
652 
653   } else {
654     B  = **Bin;
655     b = (Mat_SeqAIJ *)B->data;
656   }
657 
658   /*--------------------------------------------------------------------
659        Copy my part of matrix numerical values into the values location
660   */
661   sendcount = ad->nz + bd->nz;
662   sendbuf   = b->a + b->i[rstarts[rank]];
663   a_sendbuf = ad->a;
664   b_sendbuf = bd->a;
665   b_sendj   = bd->j;
666   n         = a->rend - a->rstart;
667   cnt       = 0;
668   for (i=0; i<n; i++) {
669 
670     /* put in lower diagonal portion */
671     m = bd->i[i+1] - bd->i[i];
672     while (m > 0) {
673       /* is it above diagonal (in bd (compressed) numbering) */
674       if (garray[*b_sendj] > a->rstart + i) break;
675       sendbuf[cnt++] = *b_sendbuf++;
676       m--;
677       b_sendj++;
678     }
679 
680     /* put in diagonal portion */
681     for (j=ad->i[i]; j<ad->i[i+1]; j++) {
682       sendbuf[cnt++] = *a_sendbuf++;
683     }
684 
685     /* put in upper diagonal portion */
686     while (m-- > 0) {
687       sendbuf[cnt++] = *b_sendbuf++;
688       b_sendj++;
689     }
690   }
691   if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
692 
693   /* -----------------------------------------------------------------
694      Gather all numerical values to all processors
695   */
696   if (!recvcounts) {
697     ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
698     displs = recvcounts + size;
699   }
700   for (i=0; i<size; i++) {
701     recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
702   }
703   displs[0]  = 0;
704   for (i=1; i<size; i++) {
705     displs[i] = displs[i-1] + recvcounts[i-1];
706   }
707   recvbuf   = b->a;
708   ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
709   ierr = PetscFree(recvcounts);CHKERRQ(ierr);
710   if (A->symmetric){
711     ierr = MatSetOption(B,MAT_SYMMETRIC);CHKERRQ(ierr);
712   } else if (A->hermitian) {
713     ierr = MatSetOption(B,MAT_HERMITIAN);CHKERRQ(ierr);
714   } else if (A->structurally_symmetric) {
715     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);CHKERRQ(ierr);
716   }
717 
718   PetscFunctionReturn(0);
719 }
720 
721 #undef __FUNCT__
722 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
723 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
724 {
725   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
726   PetscTruth  rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
727 
728   PetscFunctionBegin;
729   /*
730        Check for special case each processor gets entire matrix
731   */
732   if (ismax == 1 && C->M == C->N) {
733     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
734     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
735     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
736     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
737     if (rowflag && colflag && nrow == C->M && ncol == C->N) {
738       wantallmatrix = PETSC_TRUE;
739       ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
740     }
741   }
742   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
743   if (twantallmatrix) {
744     ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
745     PetscFunctionReturn(0);
746   }
747 
748   /* Allocate memory to hold all the submatrices */
749   if (scall != MAT_REUSE_MATRIX) {
750     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
751   }
752   /* Determine the number of stages through which submatrices are done */
753   nmax          = 20*1000000 / (C->N * sizeof(int));
754   if (!nmax) nmax = 1;
755   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
756 
757   /* Make sure every processor loops through the nstages */
758   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
759 
760   for (i=0,pos=0; i<nstages; i++) {
761     if (pos+nmax <= ismax) max_no = nmax;
762     else if (pos == ismax) max_no = 0;
763     else                   max_no = ismax-pos;
764     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
765     pos += max_no;
766   }
767   PetscFunctionReturn(0);
768 }
769 /* -------------------------------------------------------------------------*/
770 #undef __FUNCT__
771 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
772 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
773 {
774   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
775   Mat         A = c->A;
776   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
777   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
778   int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
779   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
780   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
781   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
782   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
783   int         *rmap_i,tag0,tag1,tag2,tag3;
784   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
785   MPI_Request *r_waits4,*s_waits3,*s_waits4;
786   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
787   MPI_Status  *r_status3,*r_status4,*s_status4;
788   MPI_Comm    comm;
789   PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
790   PetscTruth  sorted;
791   int         *onodes1,*olengths1;
792 
793   PetscFunctionBegin;
794   comm   = C->comm;
795   tag0   = C->tag;
796   size   = c->size;
797   rank   = c->rank;
798   m      = C->M;
799 
800   /* Get some new tags to keep the communication clean */
801   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
802   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
803   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
804 
805     /* Check if the col indices are sorted */
806   for (i=0; i<ismax; i++) {
807     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
808     if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
809     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
810     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
811   }
812 
813   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
814   ierr   = PetscMalloc(len,&irow);CHKERRQ(ierr);
815   icol   = irow + ismax;
816   nrow   = (int*)(icol + ismax);
817   ncol   = nrow + ismax;
818   rtable = ncol + ismax;
819 
820   for (i=0; i<ismax; i++) {
821     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
822     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
823     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
824     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
825   }
826 
827   /* Create hash table for the mapping :row -> proc*/
828   for (i=0,j=0; i<size; i++) {
829     jmax = c->rowners[i+1];
830     for (; j<jmax; j++) {
831       rtable[j] = i;
832     }
833   }
834 
835   /* evaluate communication - mesg to who, length of mesg, and buffer space
836      required. Based on this, buffers are allocated, and data copied into them*/
837   ierr   = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
838   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
839   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
840   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
841   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
842   for (i=0; i<ismax; i++) {
843     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
844     jmax   = nrow[i];
845     irow_i = irow[i];
846     for (j=0; j<jmax; j++) {
847       row  = irow_i[j];
848       proc = rtable[row];
849       w4[proc]++;
850     }
851     for (j=0; j<size; j++) {
852       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
853     }
854   }
855 
856   nrqs     = 0;              /* no of outgoing messages */
857   msz      = 0;              /* total mesg length (for all procs) */
858   w1[rank] = 0;              /* no mesg sent to self */
859   w3[rank] = 0;
860   for (i=0; i<size; i++) {
861     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
862   }
863   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
864   for (i=0,j=0; i<size; i++) {
865     if (w1[i]) { pa[j] = i; j++; }
866   }
867 
868   /* Each message would have a header = 1 + 2*(no of IS) + data */
869   for (i=0; i<nrqs; i++) {
870     j     = pa[i];
871     w1[j] += w2[j] + 2* w3[j];
872     msz   += w1[j];
873   }
874 
875   /* Determine the number of messages to expect, their lengths, from from-ids */
876   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
877   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
878 
879   /* Now post the Irecvs corresponding to these messages */
880   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
881 
882   ierr = PetscFree(onodes1);CHKERRQ(ierr);
883   ierr = PetscFree(olengths1);CHKERRQ(ierr);
884 
885   /* Allocate Memory for outgoing messages */
886   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
887   ierr     = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
888   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
889   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
890   /* allocate memory for outgoing data + buf to receive the first reply */
891   tmp      = (int*)(ptr + size);
892   ctr      = tmp + 2*msz;
893 
894   {
895     int *iptr = tmp,ict = 0;
896     for (i=0; i<nrqs; i++) {
897       j         = pa[i];
898       iptr     += ict;
899       sbuf1[j]  = iptr;
900       ict       = w1[j];
901     }
902   }
903 
904   /* Form the outgoing messages */
905   /* Initialize the header space */
906   for (i=0; i<nrqs; i++) {
907     j           = pa[i];
908     sbuf1[j][0] = 0;
909     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
910     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
911   }
912 
913   /* Parse the isrow and copy data into outbuf */
914   for (i=0; i<ismax; i++) {
915     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
916     irow_i = irow[i];
917     jmax   = nrow[i];
918     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
919       row  = irow_i[j];
920       proc = rtable[row];
921       if (proc != rank) { /* copy to the outgoing buf*/
922         ctr[proc]++;
923         *ptr[proc] = row;
924         ptr[proc]++;
925       }
926     }
927     /* Update the headers for the current IS */
928     for (j=0; j<size; j++) { /* Can Optimise this loop too */
929       if ((ctr_j = ctr[j])) {
930         sbuf1_j        = sbuf1[j];
931         k              = ++sbuf1_j[0];
932         sbuf1_j[2*k]   = ctr_j;
933         sbuf1_j[2*k-1] = i;
934       }
935     }
936   }
937 
938   /*  Now  post the sends */
939   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
940   for (i=0; i<nrqs; ++i) {
941     j    = pa[i];
942     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
943   }
944 
945   /* Post Receives to capture the buffer size */
946   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
947   ierr     = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf2);CHKERRQ(ierr);
948   rbuf2[0] = tmp + msz;
949   for (i=1; i<nrqs; ++i) {
950     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
951   }
952   for (i=0; i<nrqs; ++i) {
953     j    = pa[i];
954     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
955   }
956 
957   /* Send to other procs the buf size they should allocate */
958 
959 
960   /* Receive messages*/
961   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
962   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
963   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
964   ierr        = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
965   req_size    = (int*)(sbuf2 + nrqr);
966   req_source  = req_size + nrqr;
967 
968   {
969     Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
970     int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
971     int        *sbuf2_i;
972 
973     for (i=0; i<nrqr; ++i) {
974       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
975       req_size[idex] = 0;
976       rbuf1_i         = rbuf1[idex];
977       start           = 2*rbuf1_i[0] + 1;
978       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
979       ierr            = PetscMalloc((end+1)*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
980       sbuf2_i         = sbuf2[idex];
981       for (j=start; j<end; j++) {
982         id               = rbuf1_i[j] - rstart;
983         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
984         sbuf2_i[j]       = ncols;
985         req_size[idex] += ncols;
986       }
987       req_source[idex] = r_status1[i].MPI_SOURCE;
988       /* form the header */
989       sbuf2_i[0]   = req_size[idex];
990       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
991       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
992     }
993   }
994   ierr = PetscFree(r_status1);CHKERRQ(ierr);
995   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
996 
997   /*  recv buffer sizes */
998   /* Receive messages*/
999 
1000   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
1001   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1002   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1003   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1004   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1005 
1006   for (i=0; i<nrqs; ++i) {
1007     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1008     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
1009     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1010     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1011     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1012   }
1013   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1014   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1015 
1016   /* Wait on sends1 and sends2 */
1017   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1018   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1019 
1020   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1021   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1022   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1023   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1024   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1025   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1026 
1027   /* Now allocate buffers for a->j, and send them off */
1028   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
1029   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1030   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
1031   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1032 
1033   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1034   {
1035     int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1036     int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1037     int *a_j = a->j,*b_j = b->j,ctmp;
1038 
1039     for (i=0; i<nrqr; i++) {
1040       rbuf1_i   = rbuf1[i];
1041       sbuf_aj_i = sbuf_aj[i];
1042       ct1       = 2*rbuf1_i[0] + 1;
1043       ct2       = 0;
1044       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1045         kmax = rbuf1[i][2*j];
1046         for (k=0; k<kmax; k++,ct1++) {
1047           row    = rbuf1_i[ct1] - rstart;
1048           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1049           ncols  = nzA + nzB;
1050           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1051 
1052           /* load the column indices for this row into cols*/
1053           cols  = sbuf_aj_i + ct2;
1054 
1055           for (l=0; l<nzB; l++) {
1056             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1057             else break;
1058           }
1059           imark = l;
1060           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1061           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1062 
1063           ct2 += ncols;
1064         }
1065       }
1066       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1067     }
1068   }
1069   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1070   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1071 
1072   /* Allocate buffers for a->a, and send them off */
1073   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1074   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1075   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1076   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1077 
1078   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1079   {
1080     int    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1081     int    cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1082     int    *b_j = b->j;
1083     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1084 
1085     for (i=0; i<nrqr; i++) {
1086       rbuf1_i   = rbuf1[i];
1087       sbuf_aa_i = sbuf_aa[i];
1088       ct1       = 2*rbuf1_i[0]+1;
1089       ct2       = 0;
1090       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1091         kmax = rbuf1_i[2*j];
1092         for (k=0; k<kmax; k++,ct1++) {
1093           row    = rbuf1_i[ct1] - rstart;
1094           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1095           ncols  = nzA + nzB;
1096           cworkB = b_j + b_i[row];
1097           vworkA = a_a + a_i[row];
1098           vworkB = b_a + b_i[row];
1099 
1100           /* load the column values for this row into vals*/
1101           vals  = sbuf_aa_i+ct2;
1102 
1103           for (l=0; l<nzB; l++) {
1104             if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
1105             else break;
1106           }
1107           imark = l;
1108           for (l=0; l<nzA; l++)   vals[imark+l] = vworkA[l];
1109           for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1110 
1111           ct2 += ncols;
1112         }
1113       }
1114       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1115     }
1116   }
1117   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1118   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1119   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1120 
1121   /* Form the matrix */
1122   /* create col map */
1123   {
1124     int *icol_i;
1125 
1126     len     = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1127     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
1128     cmap[0] = (int*)(cmap + ismax);
1129     ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
1130     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1131     for (i=0; i<ismax; i++) {
1132       jmax   = ncol[i];
1133       icol_i = icol[i];
1134       cmap_i = cmap[i];
1135       for (j=0; j<jmax; j++) {
1136         cmap_i[icol_i[j]] = j+1;
1137       }
1138     }
1139   }
1140 
1141   /* Create lens which is required for MatCreate... */
1142   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1143   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1144   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1145   lens[0] = (int*)(lens + ismax);
1146   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1147   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1148 
1149   /* Update lens from local data */
1150   for (i=0; i<ismax; i++) {
1151     jmax   = nrow[i];
1152     cmap_i = cmap[i];
1153     irow_i = irow[i];
1154     lens_i = lens[i];
1155     for (j=0; j<jmax; j++) {
1156       row  = irow_i[j];
1157       proc = rtable[row];
1158       if (proc == rank) {
1159         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1160         for (k=0; k<ncols; k++) {
1161           if (cmap_i[cols[k]]) { lens_i[j]++;}
1162         }
1163         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1164       }
1165     }
1166   }
1167 
1168   /* Create row map*/
1169   len     = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1170   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1171   rmap[0] = (int*)(rmap + ismax);
1172   ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
1173   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1174   for (i=0; i<ismax; i++) {
1175     rmap_i = rmap[i];
1176     irow_i = irow[i];
1177     jmax   = nrow[i];
1178     for (j=0; j<jmax; j++) {
1179       rmap_i[irow_i[j]] = j;
1180     }
1181   }
1182 
1183   /* Update lens from offproc data */
1184   {
1185     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1186 
1187     for (tmp2=0; tmp2<nrqs; tmp2++) {
1188       ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1189       idex   = pa[i];
1190       sbuf1_i = sbuf1[idex];
1191       jmax    = sbuf1_i[0];
1192       ct1     = 2*jmax+1;
1193       ct2     = 0;
1194       rbuf2_i = rbuf2[i];
1195       rbuf3_i = rbuf3[i];
1196       for (j=1; j<=jmax; j++) {
1197         is_no   = sbuf1_i[2*j-1];
1198         max1    = sbuf1_i[2*j];
1199         lens_i  = lens[is_no];
1200         cmap_i  = cmap[is_no];
1201         rmap_i  = rmap[is_no];
1202         for (k=0; k<max1; k++,ct1++) {
1203           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1204           max2 = rbuf2_i[ct1];
1205           for (l=0; l<max2; l++,ct2++) {
1206             if (cmap_i[rbuf3_i[ct2]]) {
1207               lens_i[row]++;
1208             }
1209           }
1210         }
1211       }
1212     }
1213   }
1214   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1215   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1216   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1217   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1218   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1219 
1220   /* Create the submatrices */
1221   if (scall == MAT_REUSE_MATRIX) {
1222     PetscTruth flag;
1223 
1224     /*
1225         Assumes new rows are same length as the old rows,hence bug!
1226     */
1227     for (i=0; i<ismax; i++) {
1228       mat = (Mat_SeqAIJ *)(submats[i]->data);
1229       if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1230         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1231       }
1232       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
1233       if (flag == PETSC_FALSE) {
1234         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1235       }
1236       /* Initial matrix as if empty */
1237       ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
1238       submats[i]->factor = C->factor;
1239     }
1240   } else {
1241     for (i=0; i<ismax; i++) {
1242       ierr = MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);CHKERRQ(ierr);
1243       ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
1244       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1245     }
1246   }
1247 
1248   /* Assemble the matrices */
1249   /* First assemble the local rows */
1250   {
1251     int    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1252     PetscScalar *imat_a;
1253 
1254     for (i=0; i<ismax; i++) {
1255       mat       = (Mat_SeqAIJ*)submats[i]->data;
1256       imat_ilen = mat->ilen;
1257       imat_j    = mat->j;
1258       imat_i    = mat->i;
1259       imat_a    = mat->a;
1260       cmap_i    = cmap[i];
1261       rmap_i    = rmap[i];
1262       irow_i    = irow[i];
1263       jmax      = nrow[i];
1264       for (j=0; j<jmax; j++) {
1265         row      = irow_i[j];
1266         proc     = rtable[row];
1267         if (proc == rank) {
1268           old_row  = row;
1269           row      = rmap_i[row];
1270           ilen_row = imat_ilen[row];
1271           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1272           mat_i    = imat_i[row] ;
1273           mat_a    = imat_a + mat_i;
1274           mat_j    = imat_j + mat_i;
1275           for (k=0; k<ncols; k++) {
1276             if ((tcol = cmap_i[cols[k]])) {
1277               *mat_j++ = tcol - 1;
1278               *mat_a++ = vals[k];
1279               ilen_row++;
1280             }
1281           }
1282           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1283           imat_ilen[row] = ilen_row;
1284         }
1285       }
1286     }
1287   }
1288 
1289   /*   Now assemble the off proc rows*/
1290   {
1291     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1292     int    *imat_j,*imat_i;
1293     PetscScalar *imat_a,*rbuf4_i;
1294 
1295     for (tmp2=0; tmp2<nrqs; tmp2++) {
1296       ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1297       idex   = pa[i];
1298       sbuf1_i = sbuf1[idex];
1299       jmax    = sbuf1_i[0];
1300       ct1     = 2*jmax + 1;
1301       ct2     = 0;
1302       rbuf2_i = rbuf2[i];
1303       rbuf3_i = rbuf3[i];
1304       rbuf4_i = rbuf4[i];
1305       for (j=1; j<=jmax; j++) {
1306         is_no     = sbuf1_i[2*j-1];
1307         rmap_i    = rmap[is_no];
1308         cmap_i    = cmap[is_no];
1309         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1310         imat_ilen = mat->ilen;
1311         imat_j    = mat->j;
1312         imat_i    = mat->i;
1313         imat_a    = mat->a;
1314         max1      = sbuf1_i[2*j];
1315         for (k=0; k<max1; k++,ct1++) {
1316           row   = sbuf1_i[ct1];
1317           row   = rmap_i[row];
1318           ilen  = imat_ilen[row];
1319           mat_i = imat_i[row] ;
1320           mat_a = imat_a + mat_i;
1321           mat_j = imat_j + mat_i;
1322           max2 = rbuf2_i[ct1];
1323           for (l=0; l<max2; l++,ct2++) {
1324             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1325               *mat_j++ = tcol - 1;
1326               *mat_a++ = rbuf4_i[ct2];
1327               ilen++;
1328             }
1329           }
1330           imat_ilen[row] = ilen;
1331         }
1332       }
1333     }
1334   }
1335   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1336   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1337   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1338   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1339   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1340 
1341   /* Restore the indices */
1342   for (i=0; i<ismax; i++) {
1343     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1344     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1345   }
1346 
1347   /* Destroy allocated memory */
1348   ierr = PetscFree(irow);CHKERRQ(ierr);
1349   ierr = PetscFree(w1);CHKERRQ(ierr);
1350   ierr = PetscFree(pa);CHKERRQ(ierr);
1351 
1352   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1353   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1354   for (i=0; i<nrqr; ++i) {
1355     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1356   }
1357   for (i=0; i<nrqs; ++i) {
1358     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1359     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1360   }
1361 
1362   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1363   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1364   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1365   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1366   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1367   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1368   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1369 
1370   ierr = PetscFree(cmap);CHKERRQ(ierr);
1371   ierr = PetscFree(rmap);CHKERRQ(ierr);
1372   ierr = PetscFree(lens);CHKERRQ(ierr);
1373 
1374   for (i=0; i<ismax; i++) {
1375     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1376     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1377   }
1378   PetscFunctionReturn(0);
1379 }
1380 
1381