xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 634064b45b5c838063ae82f97ffb7e99245dcdb5)
1 /*
2    Routines to compute overlapping regions of a parallel MPI matrix
3   and to find submatrices that were shared across processors.
4 */
5 #include "src/mat/impls/aij/mpi/mpiaij.h"
6 #include "petscbt.h"
7 
8 static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
9 static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
10 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
11 EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
12 EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13 
14 #undef __FUNCT__
15 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
16 int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS is[],int ov)
17 {
18   int i,ierr;
19 
20   PetscFunctionBegin;
21   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
22   for (i=0; i<ov; ++i) {
23     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
24   }
25   PetscFunctionReturn(0);
26 }
27 
28 /*
29   Sample message format:
30   If a processor A wants processor B to process some elements corresponding
31   to index sets is[1],is[5]
32   mesg [0] = 2   (no of index sets in the mesg)
33   -----------
34   mesg [1] = 1 => is[1]
35   mesg [2] = sizeof(is[1]);
36   -----------
37   mesg [3] = 5  => is[5]
38   mesg [4] = sizeof(is[5]);
39   -----------
40   mesg [5]
41   mesg [n]  datas[1]
42   -----------
43   mesg[n+1]
44   mesg[m]  data(is[5])
45   -----------
46 
47   Notes:
48   nrqs - no of requests sent (or to be sent out)
49   nrqr - no of requests recieved (which have to be or which have been processed
50 */
51 #undef __FUNCT__
52 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
53 static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS is[])
54 {
55   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
56   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
57   int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
58   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
59   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
60   PetscBT     *table;
61   MPI_Comm    comm;
62   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
63   MPI_Status  *s_status,*recv_status;
64 
65   PetscFunctionBegin;
66   comm   = C->comm;
67   size   = c->size;
68   rank   = c->rank;
69   m      = C->M;
70 
71   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
72   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
73 
74   len    = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
75   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
76   n      = (int*)(idx + imax);
77   rtable = n + imax;
78 
79   for (i=0; i<imax; i++) {
80     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
81     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
82   }
83 
84   /* Create hash table for the mapping :row -> proc*/
85   for (i=0,j=0; i<size; i++) {
86     len = c->rowners[i+1];
87     for (; j<len; j++) {
88       rtable[j] = i;
89     }
90   }
91 
92   /* evaluate communication - mesg to who,length of mesg, and buffer space
93      required. Based on this, buffers are allocated, and data copied into them*/
94   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
95   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
96   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
97   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
98   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
99   for (i=0; i<imax; i++) {
100     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
101     idx_i = idx[i];
102     len   = n[i];
103     for (j=0; j<len; j++) {
104       row  = idx_i[j];
105       if (row < 0) {
106         SETERRQ(1,"Index set cannot have negative entries");
107       }
108       proc = rtable[row];
109       w4[proc]++;
110     }
111     for (j=0; j<size; j++){
112       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
113     }
114   }
115 
116   nrqs     = 0;              /* no of outgoing messages */
117   msz      = 0;              /* total mesg length (for all proc */
118   w1[rank] = 0;              /* no mesg sent to intself */
119   w3[rank] = 0;
120   for (i=0; i<size; i++) {
121     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
122   }
123   /* pa - is list of processors to communicate with */
124   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
125   for (i=0,j=0; i<size; i++) {
126     if (w1[i]) {pa[j] = i; j++;}
127   }
128 
129   /* Each message would have a header = 1 + 2*(no of IS) + data */
130   for (i=0; i<nrqs; i++) {
131     j      = pa[i];
132     w1[j] += w2[j] + 2*w3[j];
133     msz   += w1[j];
134   }
135 
136   /* Determine the number of messages to expect, their lengths, from from-ids */
137   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
138   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
139 
140   /* Now post the Irecvs corresponding to these messages */
141   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
142 
143   /* Allocate Memory for outgoing messages */
144   len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
145   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
146   ptr  = outdat + size;     /* Pointers to the data in outgoing buffers */
147   ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
148   tmp  = (int*)(outdat + 2*size);
149   ctr  = tmp + msz;
150 
151   {
152     int *iptr = tmp,ict  = 0;
153     for (i=0; i<nrqs; i++) {
154       j         = pa[i];
155       iptr     +=  ict;
156       outdat[j] = iptr;
157       ict       = w1[j];
158     }
159   }
160 
161   /* Form the outgoing messages */
162   /*plug in the headers*/
163   for (i=0; i<nrqs; i++) {
164     j            = pa[i];
165     outdat[j][0] = 0;
166     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
167     ptr[j]       = outdat[j] + 2*w3[j] + 1;
168   }
169 
170   /* Memory for doing local proc's work*/
171   {
172     int  *d_p;
173     char *t_p;
174 
175     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
176       (m)*imax*sizeof(int)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
177     ierr  = PetscMalloc(len,&table);CHKERRQ(ierr);
178     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
179     data  = (int **)(table + imax);
180     isz   = (int  *)(data  + imax);
181     d_p   = (int  *)(isz   + imax);
182     t_p   = (char *)(d_p   + m*imax);
183     for (i=0; i<imax; i++) {
184       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
185       data[i]  = d_p + (m)*i;
186     }
187   }
188 
189   /* Parse the IS and update local tables and the outgoing buf with the data*/
190   {
191     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
192     PetscBT table_i;
193 
194     for (i=0; i<imax; i++) {
195       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
196       n_i     = n[i];
197       table_i = table[i];
198       idx_i   = idx[i];
199       data_i  = data[i];
200       isz_i   = isz[i];
201       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
202         row  = idx_i[j];
203         proc = rtable[row];
204         if (proc != rank) { /* copy to the outgoing buffer */
205           ctr[proc]++;
206           *ptr[proc] = row;
207           ptr[proc]++;
208         } else { /* Update the local table */
209           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
210         }
211       }
212       /* Update the headers for the current IS */
213       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
214         if ((ctr_j = ctr[j])) {
215           outdat_j        = outdat[j];
216           k               = ++outdat_j[0];
217           outdat_j[2*k]   = ctr_j;
218           outdat_j[2*k-1] = i;
219         }
220       }
221       isz[i] = isz_i;
222     }
223   }
224 
225 
226 
227   /*  Now  post the sends */
228   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
229   for (i=0; i<nrqs; ++i) {
230     j    = pa[i];
231     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
232   }
233 
234   /* No longer need the original indices*/
235   for (i=0; i<imax; ++i) {
236     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
237   }
238   ierr = PetscFree(idx);CHKERRQ(ierr);
239 
240   for (i=0; i<imax; ++i) {
241     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
242   }
243 
244   /* Do Local work*/
245   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
246 
247   /* Receive messages*/
248   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
249   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
250 
251   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
252   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
253 
254   /* Phase 1 sends are complete - deallocate buffers */
255   ierr = PetscFree(outdat);CHKERRQ(ierr);
256   ierr = PetscFree(w1);CHKERRQ(ierr);
257 
258   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&xdata);CHKERRQ(ierr);
259   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
260   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
261   ierr = PetscFree(rbuf);CHKERRQ(ierr);
262 
263 
264  /* Send the data back*/
265   /* Do a global reduction to know the buffer space req for incoming messages*/
266   {
267     int *rw1;
268 
269     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
270     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
271 
272     for (i=0; i<nrqr; ++i) {
273       proc      = recv_status[i].MPI_SOURCE;
274       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
275       rw1[proc] = isz1[i];
276     }
277     ierr = PetscFree(onodes1);CHKERRQ(ierr);
278     ierr = PetscFree(olengths1);CHKERRQ(ierr);
279 
280     /* Determine the number of messages to expect, their lengths, from from-ids */
281     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
282     PetscFree(rw1);
283   }
284   /* Now post the Irecvs corresponding to these messages */
285   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
286 
287   /*  Now  post the sends */
288   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
289   for (i=0; i<nrqr; ++i) {
290     j    = recv_status[i].MPI_SOURCE;
291     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
292   }
293 
294   /* receive work done on other processors*/
295   {
296     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
297     PetscBT     table_i;
298     MPI_Status  *status2;
299 
300     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
301     for (i=0; i<nrqs; ++i) {
302       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
303       /* Process the message*/
304       rbuf2_i = rbuf2[idex];
305       ct1     = 2*rbuf2_i[0]+1;
306       jmax    = rbuf2[idex][0];
307       for (j=1; j<=jmax; j++) {
308         max     = rbuf2_i[2*j];
309         is_no   = rbuf2_i[2*j-1];
310         isz_i   = isz[is_no];
311         data_i  = data[is_no];
312         table_i = table[is_no];
313         for (k=0; k<max; k++,ct1++) {
314           row = rbuf2_i[ct1];
315           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
316         }
317         isz[is_no] = isz_i;
318       }
319     }
320 
321     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
322     ierr = PetscFree(status2);CHKERRQ(ierr);
323   }
324 
325   for (i=0; i<imax; ++i) {
326     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
327   }
328 
329   ierr = PetscFree(onodes2);CHKERRQ(ierr);
330   ierr = PetscFree(olengths2);CHKERRQ(ierr);
331 
332   ierr = PetscFree(pa);CHKERRQ(ierr);
333   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
334   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
335   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
337   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(table);CHKERRQ(ierr);
339   ierr = PetscFree(s_status);CHKERRQ(ierr);
340   ierr = PetscFree(recv_status);CHKERRQ(ierr);
341   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
342   ierr = PetscFree(xdata);CHKERRQ(ierr);
343   ierr = PetscFree(isz1);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
349 /*
350    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
351        the work on the local processor.
352 
353      Inputs:
354       C      - MAT_MPIAIJ;
355       imax - total no of index sets processed at a time;
356       table  - an array of char - size = m bits.
357 
358      Output:
359       isz    - array containing the count of the solution elements correspondign
360                to each index set;
361       data   - pointer to the solutions
362 */
363 static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   int        start,end,val,max,rstart,cstart,*ai,*aj;
369   int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = c->rstart;
374   cstart = c->cstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
408 /*
409       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
410          and return the output
411 
412          Input:
413            C    - the matrix
414            nrqr - no of messages being processed.
415            rbuf - an array of pointers to the recieved requests
416 
417          Output:
418            xdata - array of messages to be sent back
419            isz1  - size of each message
420 
421   For better efficiency perhaps we should malloc seperately each xdata[i],
422 then if a remalloc is required we need only copy the data for that one row
423 rather then all previous rows as it is now where a single large chunck of
424 memory is used.
425 
426 */
427 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
428 {
429   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
430   Mat         A = c->A,B = c->B;
431   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
433   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
434   int         val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
435   int         *rbuf_i,kmax,rbuf_0,ierr;
436   PetscBT     xtable;
437 
438   PetscFunctionBegin;
439   rank   = c->rank;
440   m      = C->M;
441   rstart = c->rstart;
442   cstart = c->cstart;
443   ai     = a->i;
444   aj     = a->j;
445   bi     = b->i;
446   bj     = b->j;
447   garray = c->garray;
448 
449 
450   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
451     rbuf_i  =  rbuf[i];
452     rbuf_0  =  rbuf_i[0];
453     ct     += rbuf_0;
454     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
455   }
456 
457   if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
458   else      max1 = 1;
459   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
460   ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
461   ++no_malloc;
462   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
463   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
464 
465   ct3 = 0;
466   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
467     rbuf_i =  rbuf[i];
468     rbuf_0 =  rbuf_i[0];
469     ct1    =  2*rbuf_0+1;
470     ct2    =  ct1;
471     ct3    += ct1;
472     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
473       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
474       oct2 = ct2;
475       kmax = rbuf_i[2*j];
476       for (k=0; k<kmax; k++,ct1++) {
477         row = rbuf_i[ct1];
478         if (!PetscBTLookupSet(xtable,row)) {
479           if (!(ct3 < mem_estimate)) {
480             new_estimate = (int)(1.5*mem_estimate)+1;
481             ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
482             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
483             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
484             xdata[0]     = tmp;
485             mem_estimate = new_estimate; ++no_malloc;
486             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
487           }
488           xdata[i][ct2++] = row;
489           ct3++;
490         }
491       }
492       for (k=oct2,max2=ct2; k<max2; k++) {
493         row   = xdata[i][k] - rstart;
494         start = ai[row];
495         end   = ai[row+1];
496         for (l=start; l<end; l++) {
497           val = aj[l] + cstart;
498           if (!PetscBTLookupSet(xtable,val)) {
499             if (!(ct3 < mem_estimate)) {
500               new_estimate = (int)(1.5*mem_estimate)+1;
501               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
502               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
503               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
504               xdata[0]     = tmp;
505               mem_estimate = new_estimate; ++no_malloc;
506               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
507             }
508             xdata[i][ct2++] = val;
509             ct3++;
510           }
511         }
512         start = bi[row];
513         end   = bi[row+1];
514         for (l=start; l<end; l++) {
515           val = garray[bj[l]];
516           if (!PetscBTLookupSet(xtable,val)) {
517             if (!(ct3 < mem_estimate)) {
518               new_estimate = (int)(1.5*mem_estimate)+1;
519               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
520               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
521               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
522               xdata[0]     = tmp;
523               mem_estimate = new_estimate; ++no_malloc;
524               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
525             }
526             xdata[i][ct2++] = val;
527             ct3++;
528           }
529         }
530       }
531       /* Update the header*/
532       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
533       xdata[i][2*j-1] = rbuf_i[2*j-1];
534     }
535     xdata[i][0] = rbuf_0;
536     xdata[i+1]  = xdata[i] + ct2;
537     isz1[i]     = ct2; /* size of each message */
538   }
539   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
540   PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
541   PetscFunctionReturn(0);
542 }
543 /* -------------------------------------------------------------------------*/
544 EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,const IS[],const IS[],MatReuse,Mat*);
545 EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
546 /*
547     Every processor gets the entire matrix
548 */
549 #undef __FUNCT__
550 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
551 int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
552 {
553   Mat          B;
554   Mat_MPIAIJ   *a = (Mat_MPIAIJ *)A->data;
555   Mat_SeqAIJ   *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
556   int          ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
557   int          m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
558   PetscScalar  *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
559 
560   PetscFunctionBegin;
561   ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
562   ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);
563 
564   if (scall == MAT_INITIAL_MATRIX) {
565     /* ----------------------------------------------------------------
566          Tell every processor the number of nonzeros per row
567     */
568     ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
569     for (i=a->rstart; i<a->rend; i++) {
570       lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
571     }
572     sendcount = a->rend - a->rstart;
573     ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
574     displs     = recvcounts + size;
575     for (i=0; i<size; i++) {
576       recvcounts[i] = a->rowners[i+1] - a->rowners[i];
577       displs[i]     = a->rowners[i];
578     }
579     ierr  = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
580 
581     /* ---------------------------------------------------------------
582          Create the sequential matrix of the same type as the local block diagonal
583     */
584     ierr  = MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);CHKERRQ(ierr);
585     ierr  = MatSetType(B,a->A->type_name);CHKERRQ(ierr);
586     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
587     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
588     **Bin = B;
589     b = (Mat_SeqAIJ *)B->data;
590 
591     /*--------------------------------------------------------------------
592        Copy my part of matrix column indices over
593     */
594     sendcount  = ad->nz + bd->nz;
595     jsendbuf   = b->j + b->i[rstarts[rank]];
596     a_jsendbuf = ad->j;
597     b_jsendbuf = bd->j;
598     n          = a->rend - a->rstart;
599     cnt        = 0;
600     for (i=0; i<n; i++) {
601 
602       /* put in lower diagonal portion */
603       m = bd->i[i+1] - bd->i[i];
604       while (m > 0) {
605         /* is it above diagonal (in bd (compressed) numbering) */
606         if (garray[*b_jsendbuf] > a->rstart + i) break;
607         jsendbuf[cnt++] = garray[*b_jsendbuf++];
608         m--;
609       }
610 
611       /* put in diagonal portion */
612       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
613         jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
614       }
615 
616       /* put in upper diagonal portion */
617       while (m-- > 0) {
618         jsendbuf[cnt++] = garray[*b_jsendbuf++];
619       }
620     }
621     if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
622 
623     /*--------------------------------------------------------------------
624        Gather all column indices to all processors
625     */
626     for (i=0; i<size; i++) {
627       recvcounts[i] = 0;
628       for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
629         recvcounts[i] += lens[j];
630       }
631     }
632     displs[0]  = 0;
633     for (i=1; i<size; i++) {
634       displs[i] = displs[i-1] + recvcounts[i-1];
635     }
636     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
637 
638     /*--------------------------------------------------------------------
639         Assemble the matrix into useable form (note numerical values not yet set)
640     */
641     /* set the b->ilen (length of each row) values */
642     ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
643     /* set the b->i indices */
644     b->i[0] = 0;
645     for (i=1; i<=A->M; i++) {
646       b->i[i] = b->i[i-1] + lens[i-1];
647     }
648     ierr = PetscFree(lens);CHKERRQ(ierr);
649     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
650     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
651 
652   } else {
653     B  = **Bin;
654     b = (Mat_SeqAIJ *)B->data;
655   }
656 
657   /*--------------------------------------------------------------------
658        Copy my part of matrix numerical values into the values location
659   */
660   sendcount = ad->nz + bd->nz;
661   sendbuf   = b->a + b->i[rstarts[rank]];
662   a_sendbuf = ad->a;
663   b_sendbuf = bd->a;
664   b_sendj   = bd->j;
665   n         = a->rend - a->rstart;
666   cnt       = 0;
667   for (i=0; i<n; i++) {
668 
669     /* put in lower diagonal portion */
670     m = bd->i[i+1] - bd->i[i];
671     while (m > 0) {
672       /* is it above diagonal (in bd (compressed) numbering) */
673       if (garray[*b_sendj] > a->rstart + i) break;
674       sendbuf[cnt++] = *b_sendbuf++;
675       m--;
676       b_sendj++;
677     }
678 
679     /* put in diagonal portion */
680     for (j=ad->i[i]; j<ad->i[i+1]; j++) {
681       sendbuf[cnt++] = *a_sendbuf++;
682     }
683 
684     /* put in upper diagonal portion */
685     while (m-- > 0) {
686       sendbuf[cnt++] = *b_sendbuf++;
687       b_sendj++;
688     }
689   }
690   if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
691 
692   /* -----------------------------------------------------------------
693      Gather all numerical values to all processors
694   */
695   if (!recvcounts) {
696     ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
697     displs = recvcounts + size;
698   }
699   for (i=0; i<size; i++) {
700     recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
701   }
702   displs[0]  = 0;
703   for (i=1; i<size; i++) {
704     displs[i] = displs[i-1] + recvcounts[i-1];
705   }
706   recvbuf   = b->a;
707   ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
708   ierr = PetscFree(recvcounts);CHKERRQ(ierr);
709   if (A->symmetric){
710     ierr = MatSetOption(B,MAT_SYMMETRIC);CHKERRQ(ierr);
711   } else if (A->hermitian) {
712     ierr = MatSetOption(B,MAT_HERMITIAN);CHKERRQ(ierr);
713   } else if (A->structurally_symmetric) {
714     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);CHKERRQ(ierr);
715   }
716 
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
722 int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
723 {
724   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
725   PetscTruth  rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
726 
727   PetscFunctionBegin;
728   /*
729        Check for special case each processor gets entire matrix
730   */
731   if (ismax == 1 && C->M == C->N) {
732     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
733     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
734     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
735     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
736     if (rowflag && colflag && nrow == C->M && ncol == C->N) {
737       wantallmatrix = PETSC_TRUE;
738       ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
739     }
740   }
741   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
742   if (twantallmatrix) {
743     ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
744     PetscFunctionReturn(0);
745   }
746 
747   /* Allocate memory to hold all the submatrices */
748   if (scall != MAT_REUSE_MATRIX) {
749     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
750   }
751   /* Determine the number of stages through which submatrices are done */
752   nmax          = 20*1000000 / (C->N * sizeof(int));
753   if (!nmax) nmax = 1;
754   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
755 
756   /* Make sure every processor loops through the nstages */
757   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
758 
759   for (i=0,pos=0; i<nstages; i++) {
760     if (pos+nmax <= ismax) max_no = nmax;
761     else if (pos == ismax) max_no = 0;
762     else                   max_no = ismax-pos;
763     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
764     pos += max_no;
765   }
766   PetscFunctionReturn(0);
767 }
768 /* -------------------------------------------------------------------------*/
769 #undef __FUNCT__
770 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
771 int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
772 {
773   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
774   Mat         A = c->A;
775   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
776   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
777   int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
778   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
779   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
780   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
781   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
782   int         *rmap_i,tag0,tag1,tag2,tag3;
783   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
784   MPI_Request *r_waits4,*s_waits3,*s_waits4;
785   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
786   MPI_Status  *r_status3,*r_status4,*s_status4;
787   MPI_Comm    comm;
788   PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
789   PetscTruth  sorted;
790   int         *onodes1,*olengths1;
791 
792   PetscFunctionBegin;
793   comm   = C->comm;
794   tag0   = C->tag;
795   size   = c->size;
796   rank   = c->rank;
797   m      = C->M;
798 
799   /* Get some new tags to keep the communication clean */
800   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
801   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
802   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
803 
804     /* Check if the col indices are sorted */
805   for (i=0; i<ismax; i++) {
806     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
807     if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
808     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
809     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
810   }
811 
812   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
813   ierr   = PetscMalloc(len,&irow);CHKERRQ(ierr);
814   icol   = irow + ismax;
815   nrow   = (int*)(icol + ismax);
816   ncol   = nrow + ismax;
817   rtable = ncol + ismax;
818 
819   for (i=0; i<ismax; i++) {
820     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
821     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
822     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
823     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
824   }
825 
826   /* Create hash table for the mapping :row -> proc*/
827   for (i=0,j=0; i<size; i++) {
828     jmax = c->rowners[i+1];
829     for (; j<jmax; j++) {
830       rtable[j] = i;
831     }
832   }
833 
834   /* evaluate communication - mesg to who, length of mesg, and buffer space
835      required. Based on this, buffers are allocated, and data copied into them*/
836   ierr   = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
837   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
838   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
839   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
840   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
841   for (i=0; i<ismax; i++) {
842     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
843     jmax   = nrow[i];
844     irow_i = irow[i];
845     for (j=0; j<jmax; j++) {
846       row  = irow_i[j];
847       proc = rtable[row];
848       w4[proc]++;
849     }
850     for (j=0; j<size; j++) {
851       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
852     }
853   }
854 
855   nrqs     = 0;              /* no of outgoing messages */
856   msz      = 0;              /* total mesg length (for all procs) */
857   w1[rank] = 0;              /* no mesg sent to self */
858   w3[rank] = 0;
859   for (i=0; i<size; i++) {
860     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
861   }
862   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
863   for (i=0,j=0; i<size; i++) {
864     if (w1[i]) { pa[j] = i; j++; }
865   }
866 
867   /* Each message would have a header = 1 + 2*(no of IS) + data */
868   for (i=0; i<nrqs; i++) {
869     j     = pa[i];
870     w1[j] += w2[j] + 2* w3[j];
871     msz   += w1[j];
872   }
873 
874   /* Determine the number of messages to expect, their lengths, from from-ids */
875   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
876   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
877 
878   /* Now post the Irecvs corresponding to these messages */
879   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
880 
881   ierr = PetscFree(onodes1);CHKERRQ(ierr);
882   ierr = PetscFree(olengths1);CHKERRQ(ierr);
883 
884   /* Allocate Memory for outgoing messages */
885   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
886   ierr     = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
887   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
888   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
889   /* allocate memory for outgoing data + buf to receive the first reply */
890   tmp      = (int*)(ptr + size);
891   ctr      = tmp + 2*msz;
892 
893   {
894     int *iptr = tmp,ict = 0;
895     for (i=0; i<nrqs; i++) {
896       j         = pa[i];
897       iptr     += ict;
898       sbuf1[j]  = iptr;
899       ict       = w1[j];
900     }
901   }
902 
903   /* Form the outgoing messages */
904   /* Initialize the header space */
905   for (i=0; i<nrqs; i++) {
906     j           = pa[i];
907     sbuf1[j][0] = 0;
908     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
909     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
910   }
911 
912   /* Parse the isrow and copy data into outbuf */
913   for (i=0; i<ismax; i++) {
914     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
915     irow_i = irow[i];
916     jmax   = nrow[i];
917     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
918       row  = irow_i[j];
919       proc = rtable[row];
920       if (proc != rank) { /* copy to the outgoing buf*/
921         ctr[proc]++;
922         *ptr[proc] = row;
923         ptr[proc]++;
924       }
925     }
926     /* Update the headers for the current IS */
927     for (j=0; j<size; j++) { /* Can Optimise this loop too */
928       if ((ctr_j = ctr[j])) {
929         sbuf1_j        = sbuf1[j];
930         k              = ++sbuf1_j[0];
931         sbuf1_j[2*k]   = ctr_j;
932         sbuf1_j[2*k-1] = i;
933       }
934     }
935   }
936 
937   /*  Now  post the sends */
938   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
939   for (i=0; i<nrqs; ++i) {
940     j    = pa[i];
941     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
942   }
943 
944   /* Post Receives to capture the buffer size */
945   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
946   ierr     = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf2);CHKERRQ(ierr);
947   rbuf2[0] = tmp + msz;
948   for (i=1; i<nrqs; ++i) {
949     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
950   }
951   for (i=0; i<nrqs; ++i) {
952     j    = pa[i];
953     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
954   }
955 
956   /* Send to other procs the buf size they should allocate */
957 
958 
959   /* Receive messages*/
960   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
961   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
962   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
963   ierr        = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
964   req_size    = (int*)(sbuf2 + nrqr);
965   req_source  = req_size + nrqr;
966 
967   {
968     Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
969     int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
970     int        *sbuf2_i;
971 
972     for (i=0; i<nrqr; ++i) {
973       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
974       req_size[idex] = 0;
975       rbuf1_i         = rbuf1[idex];
976       start           = 2*rbuf1_i[0] + 1;
977       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
978       ierr            = PetscMalloc((end+1)*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
979       sbuf2_i         = sbuf2[idex];
980       for (j=start; j<end; j++) {
981         id               = rbuf1_i[j] - rstart;
982         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
983         sbuf2_i[j]       = ncols;
984         req_size[idex] += ncols;
985       }
986       req_source[idex] = r_status1[i].MPI_SOURCE;
987       /* form the header */
988       sbuf2_i[0]   = req_size[idex];
989       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
990       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
991     }
992   }
993   ierr = PetscFree(r_status1);CHKERRQ(ierr);
994   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
995 
996   /*  recv buffer sizes */
997   /* Receive messages*/
998 
999   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
1000   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1001   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1002   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1003   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1004 
1005   for (i=0; i<nrqs; ++i) {
1006     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1007     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
1008     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1009     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1010     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1011   }
1012   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1013   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1014 
1015   /* Wait on sends1 and sends2 */
1016   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1017   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1018 
1019   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1020   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1021   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1022   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1023   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1024   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1025 
1026   /* Now allocate buffers for a->j, and send them off */
1027   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
1028   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1029   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
1030   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1031 
1032   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1033   {
1034     int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1035     int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1036     int *a_j = a->j,*b_j = b->j,ctmp;
1037 
1038     for (i=0; i<nrqr; i++) {
1039       rbuf1_i   = rbuf1[i];
1040       sbuf_aj_i = sbuf_aj[i];
1041       ct1       = 2*rbuf1_i[0] + 1;
1042       ct2       = 0;
1043       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1044         kmax = rbuf1[i][2*j];
1045         for (k=0; k<kmax; k++,ct1++) {
1046           row    = rbuf1_i[ct1] - rstart;
1047           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1048           ncols  = nzA + nzB;
1049           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1050 
1051           /* load the column indices for this row into cols*/
1052           cols  = sbuf_aj_i + ct2;
1053 
1054           for (l=0; l<nzB; l++) {
1055             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1056             else break;
1057           }
1058           imark = l;
1059           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1060           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1061 
1062           ct2 += ncols;
1063         }
1064       }
1065       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1066     }
1067   }
1068   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1069   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1070 
1071   /* Allocate buffers for a->a, and send them off */
1072   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1073   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1074   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1075   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1076 
1077   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1078   {
1079     int    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1080     int    cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1081     int    *b_j = b->j;
1082     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1083 
1084     for (i=0; i<nrqr; i++) {
1085       rbuf1_i   = rbuf1[i];
1086       sbuf_aa_i = sbuf_aa[i];
1087       ct1       = 2*rbuf1_i[0]+1;
1088       ct2       = 0;
1089       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1090         kmax = rbuf1_i[2*j];
1091         for (k=0; k<kmax; k++,ct1++) {
1092           row    = rbuf1_i[ct1] - rstart;
1093           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1094           ncols  = nzA + nzB;
1095           cworkB = b_j + b_i[row];
1096           vworkA = a_a + a_i[row];
1097           vworkB = b_a + b_i[row];
1098 
1099           /* load the column values for this row into vals*/
1100           vals  = sbuf_aa_i+ct2;
1101 
1102           for (l=0; l<nzB; l++) {
1103             if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
1104             else break;
1105           }
1106           imark = l;
1107           for (l=0; l<nzA; l++)   vals[imark+l] = vworkA[l];
1108           for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1109 
1110           ct2 += ncols;
1111         }
1112       }
1113       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1114     }
1115   }
1116   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1117   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1118   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1119 
1120   /* Form the matrix */
1121   /* create col map */
1122   {
1123     int *icol_i;
1124 
1125     len     = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1126     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
1127     cmap[0] = (int*)(cmap + ismax);
1128     ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
1129     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1130     for (i=0; i<ismax; i++) {
1131       jmax   = ncol[i];
1132       icol_i = icol[i];
1133       cmap_i = cmap[i];
1134       for (j=0; j<jmax; j++) {
1135         cmap_i[icol_i[j]] = j+1;
1136       }
1137     }
1138   }
1139 
1140   /* Create lens which is required for MatCreate... */
1141   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1142   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1143   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1144   lens[0] = (int*)(lens + ismax);
1145   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1146   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1147 
1148   /* Update lens from local data */
1149   for (i=0; i<ismax; i++) {
1150     jmax   = nrow[i];
1151     cmap_i = cmap[i];
1152     irow_i = irow[i];
1153     lens_i = lens[i];
1154     for (j=0; j<jmax; j++) {
1155       row  = irow_i[j];
1156       proc = rtable[row];
1157       if (proc == rank) {
1158         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1159         for (k=0; k<ncols; k++) {
1160           if (cmap_i[cols[k]]) { lens_i[j]++;}
1161         }
1162         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1163       }
1164     }
1165   }
1166 
1167   /* Create row map*/
1168   len     = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1169   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1170   rmap[0] = (int*)(rmap + ismax);
1171   ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
1172   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1173   for (i=0; i<ismax; i++) {
1174     rmap_i = rmap[i];
1175     irow_i = irow[i];
1176     jmax   = nrow[i];
1177     for (j=0; j<jmax; j++) {
1178       rmap_i[irow_i[j]] = j;
1179     }
1180   }
1181 
1182   /* Update lens from offproc data */
1183   {
1184     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1185 
1186     for (tmp2=0; tmp2<nrqs; tmp2++) {
1187       ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1188       idex   = pa[i];
1189       sbuf1_i = sbuf1[idex];
1190       jmax    = sbuf1_i[0];
1191       ct1     = 2*jmax+1;
1192       ct2     = 0;
1193       rbuf2_i = rbuf2[i];
1194       rbuf3_i = rbuf3[i];
1195       for (j=1; j<=jmax; j++) {
1196         is_no   = sbuf1_i[2*j-1];
1197         max1    = sbuf1_i[2*j];
1198         lens_i  = lens[is_no];
1199         cmap_i  = cmap[is_no];
1200         rmap_i  = rmap[is_no];
1201         for (k=0; k<max1; k++,ct1++) {
1202           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1203           max2 = rbuf2_i[ct1];
1204           for (l=0; l<max2; l++,ct2++) {
1205             if (cmap_i[rbuf3_i[ct2]]) {
1206               lens_i[row]++;
1207             }
1208           }
1209         }
1210       }
1211     }
1212   }
1213   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1214   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1215   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1216   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1217   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1218 
1219   /* Create the submatrices */
1220   if (scall == MAT_REUSE_MATRIX) {
1221     PetscTruth flag;
1222 
1223     /*
1224         Assumes new rows are same length as the old rows,hence bug!
1225     */
1226     for (i=0; i<ismax; i++) {
1227       mat = (Mat_SeqAIJ *)(submats[i]->data);
1228       if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1229         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1230       }
1231       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
1232       if (flag == PETSC_FALSE) {
1233         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1234       }
1235       /* Initial matrix as if empty */
1236       ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
1237       submats[i]->factor = C->factor;
1238     }
1239   } else {
1240     for (i=0; i<ismax; i++) {
1241       ierr = MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);CHKERRQ(ierr);
1242       ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
1243       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1244     }
1245   }
1246 
1247   /* Assemble the matrices */
1248   /* First assemble the local rows */
1249   {
1250     int    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1251     PetscScalar *imat_a;
1252 
1253     for (i=0; i<ismax; i++) {
1254       mat       = (Mat_SeqAIJ*)submats[i]->data;
1255       imat_ilen = mat->ilen;
1256       imat_j    = mat->j;
1257       imat_i    = mat->i;
1258       imat_a    = mat->a;
1259       cmap_i    = cmap[i];
1260       rmap_i    = rmap[i];
1261       irow_i    = irow[i];
1262       jmax      = nrow[i];
1263       for (j=0; j<jmax; j++) {
1264         row      = irow_i[j];
1265         proc     = rtable[row];
1266         if (proc == rank) {
1267           old_row  = row;
1268           row      = rmap_i[row];
1269           ilen_row = imat_ilen[row];
1270           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1271           mat_i    = imat_i[row] ;
1272           mat_a    = imat_a + mat_i;
1273           mat_j    = imat_j + mat_i;
1274           for (k=0; k<ncols; k++) {
1275             if ((tcol = cmap_i[cols[k]])) {
1276               *mat_j++ = tcol - 1;
1277               *mat_a++ = vals[k];
1278               ilen_row++;
1279             }
1280           }
1281           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1282           imat_ilen[row] = ilen_row;
1283         }
1284       }
1285     }
1286   }
1287 
1288   /*   Now assemble the off proc rows*/
1289   {
1290     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1291     int    *imat_j,*imat_i;
1292     PetscScalar *imat_a,*rbuf4_i;
1293 
1294     for (tmp2=0; tmp2<nrqs; tmp2++) {
1295       ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1296       idex   = pa[i];
1297       sbuf1_i = sbuf1[idex];
1298       jmax    = sbuf1_i[0];
1299       ct1     = 2*jmax + 1;
1300       ct2     = 0;
1301       rbuf2_i = rbuf2[i];
1302       rbuf3_i = rbuf3[i];
1303       rbuf4_i = rbuf4[i];
1304       for (j=1; j<=jmax; j++) {
1305         is_no     = sbuf1_i[2*j-1];
1306         rmap_i    = rmap[is_no];
1307         cmap_i    = cmap[is_no];
1308         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1309         imat_ilen = mat->ilen;
1310         imat_j    = mat->j;
1311         imat_i    = mat->i;
1312         imat_a    = mat->a;
1313         max1      = sbuf1_i[2*j];
1314         for (k=0; k<max1; k++,ct1++) {
1315           row   = sbuf1_i[ct1];
1316           row   = rmap_i[row];
1317           ilen  = imat_ilen[row];
1318           mat_i = imat_i[row] ;
1319           mat_a = imat_a + mat_i;
1320           mat_j = imat_j + mat_i;
1321           max2 = rbuf2_i[ct1];
1322           for (l=0; l<max2; l++,ct2++) {
1323             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1324               *mat_j++ = tcol - 1;
1325               *mat_a++ = rbuf4_i[ct2];
1326               ilen++;
1327             }
1328           }
1329           imat_ilen[row] = ilen;
1330         }
1331       }
1332     }
1333   }
1334   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1335   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1336   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1337   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1338   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1339 
1340   /* Restore the indices */
1341   for (i=0; i<ismax; i++) {
1342     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1343     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1344   }
1345 
1346   /* Destroy allocated memory */
1347   ierr = PetscFree(irow);CHKERRQ(ierr);
1348   ierr = PetscFree(w1);CHKERRQ(ierr);
1349   ierr = PetscFree(pa);CHKERRQ(ierr);
1350 
1351   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1352   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1353   for (i=0; i<nrqr; ++i) {
1354     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1355   }
1356   for (i=0; i<nrqs; ++i) {
1357     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1358     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1359   }
1360 
1361   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1362   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1363   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1364   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1365   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1366   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1367   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1368 
1369   ierr = PetscFree(cmap);CHKERRQ(ierr);
1370   ierr = PetscFree(rmap);CHKERRQ(ierr);
1371   ierr = PetscFree(lens);CHKERRQ(ierr);
1372 
1373   for (i=0; i<ismax; i++) {
1374     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1375     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380